Mock Version: 3.5 Mock Version: 3.5 Mock Version: 3.5 ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -bs --noclean --target aarch64 --nodeps /builddir/build/SPECS/python-xformers.spec'], chrootPath='/var/lib/mock/dist-an23-epao-build-354202-71029/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=86400uid=982gid=135user='mockbuild'nspawn_args=[]unshare_net=TrueprintOutput=False) Executing command: ['bash', '--login', '-c', '/usr/bin/rpmbuild -bs --noclean --target aarch64 --nodeps /builddir/build/SPECS/python-xformers.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'} and shell False Building target platforms: aarch64 Building for target aarch64 setting SOURCE_DATE_EPOCH=1708214400 Wrote: /builddir/build/SRPMS/python-xformers-0.0.24-1.an23.src.rpm Child return code was: 0 ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -bb --noclean --target aarch64 --nodeps /builddir/build/SPECS/python-xformers.spec'], chrootPath='/var/lib/mock/dist-an23-epao-build-354202-71029/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=86400uid=982gid=135user='mockbuild'nspawn_args=[]unshare_net=TrueprintOutput=False) Executing command: ['bash', '--login', '-c', '/usr/bin/rpmbuild -bb --noclean --target aarch64 --nodeps /builddir/build/SPECS/python-xformers.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'} and shell False Building target platforms: aarch64 Building for target aarch64 setting SOURCE_DATE_EPOCH=1708214400 Executing(%prep): /bin/sh -e /var/tmp/rpm-tmp.an170r + umask 022 + cd /builddir/build/BUILD + cd /builddir/build/BUILD + rm -rf xformers-0.0.24 + /usr/lib/rpm/rpmuncompress -x /builddir/build/SOURCES/xformers-0.0.24.tar.gz + STATUS=0 + '[' 0 -ne 0 ']' + cd xformers-0.0.24 + /usr/bin/chmod -Rf a+rX,u+w,g-w,o-w . + rm -rf xformers.egg-info + RPM_EC=0 ++ jobs -p + exit 0 Executing(%build): /bin/sh -e /var/tmp/rpm-tmp.GVl65G + umask 022 + cd /builddir/build/BUILD + CFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + export CFLAGS + CXXFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + export CXXFLAGS + FFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -I/usr/lib64/gfortran/modules' + export FFLAGS + FCFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -I/usr/lib64/gfortran/modules' + export FCFLAGS + LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 ' + export LDFLAGS + LT_SYS_LIBRARY_PATH=/usr/lib64: + export LT_SYS_LIBRARY_PATH + CC=gcc + export CC + CXX=g++ + export CXX + cd xformers-0.0.24 + export 'NVCC_FLAGS= --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all' + NVCC_FLAGS=' --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all' + export FORCE_CUDA=1 + FORCE_CUDA=1 + export MAX_JOBS=4 + MAX_JOBS=4 + export CUDA_HOME=/usr/local/cuda-12.1 + CUDA_HOME=/usr/local/cuda-12.1 + export LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64/ + LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64/ + export 'TORCH_CUDA_ARCH_LIST=5.0;5.2;6.0;6.1;7.0;7.5;8.0;8.6;9.0' + TORCH_CUDA_ARCH_LIST='5.0;5.2;6.0;6.1;7.0;7.5;8.0;8.6;9.0' + CFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 ' + /usr/bin/python3 setup.py build '--executable=/usr/bin/python3 -s' No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda-12.1' fatal: not a git repository (or any of the parent directories): .git Looks like we are using CUDA 12.1 which segfaults when provided with the -generate-line-info flag. Disabling it. Looks like we are using CUDA 12.1 which segfaults when provided with the -generate-line-info flag. Disabling it. running build running build_py creating build creating build/lib.linux-aarch64-cpython-310 creating build/lib.linux-aarch64-cpython-310/xformers copying xformers/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers copying xformers/_cpp_lib.py -> build/lib.linux-aarch64-cpython-310/xformers copying xformers/_deprecation_warning.py -> build/lib.linux-aarch64-cpython-310/xformers copying xformers/attn_bias_utils.py -> build/lib.linux-aarch64-cpython-310/xformers copying xformers/checkpoint.py -> build/lib.linux-aarch64-cpython-310/xformers copying xformers/info.py -> build/lib.linux-aarch64-cpython-310/xformers copying xformers/test.py -> build/lib.linux-aarch64-cpython-310/xformers copying xformers/utils.py -> build/lib.linux-aarch64-cpython-310/xformers creating build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_attn_decoding.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_blocksparse_transformers.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_causal_blocksparse.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_core.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_indexing.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_mem_eff_attention.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_mem_eff_attn_decoder.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_mlp.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_multi_head_dispatch.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_nystrom_utils.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_revnet.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_sddmm.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_sequence_parallel_fused.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_sp24.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_swiglu.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_tiled_matmul.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_transformer.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_triton_blocksparse.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_triton_dropout.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_triton_fused_linear.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_triton_layernorm.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_triton_softmax.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/utils.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks creating build/lib.linux-aarch64-cpython-310/xformers/components copying xformers/components/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/components copying xformers/components/activations.py -> build/lib.linux-aarch64-cpython-310/xformers/components copying xformers/components/input_projection.py -> build/lib.linux-aarch64-cpython-310/xformers/components copying xformers/components/multi_head_dispatch.py -> build/lib.linux-aarch64-cpython-310/xformers/components copying xformers/components/patch_embedding.py -> build/lib.linux-aarch64-cpython-310/xformers/components copying xformers/components/residual.py -> build/lib.linux-aarch64-cpython-310/xformers/components copying xformers/components/reversible.py -> build/lib.linux-aarch64-cpython-310/xformers/components copying xformers/components/simplicial_embedding.py -> build/lib.linux-aarch64-cpython-310/xformers/components creating build/lib.linux-aarch64-cpython-310/xformers/factory copying xformers/factory/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/factory copying xformers/factory/block_configs.py -> build/lib.linux-aarch64-cpython-310/xformers/factory copying xformers/factory/block_factory.py -> build/lib.linux-aarch64-cpython-310/xformers/factory copying xformers/factory/hydra_helper.py -> build/lib.linux-aarch64-cpython-310/xformers/factory copying xformers/factory/model_factory.py -> build/lib.linux-aarch64-cpython-310/xformers/factory copying xformers/factory/weight_init.py -> build/lib.linux-aarch64-cpython-310/xformers/factory creating build/lib.linux-aarch64-cpython-310/xformers/helpers copying xformers/helpers/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/helpers copying xformers/helpers/hierarchical_configs.py -> build/lib.linux-aarch64-cpython-310/xformers/helpers copying xformers/helpers/test_utils.py -> build/lib.linux-aarch64-cpython-310/xformers/helpers copying xformers/helpers/timm_sparse_attention.py -> build/lib.linux-aarch64-cpython-310/xformers/helpers creating build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/common.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/differentiable_collectives.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/indexing.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/modpar_layers.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/rmsnorm.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/rope_padded.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/seqpar.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/sequence_parallel_fused_ops.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/sp24.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/swiglu_op.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/tiled_matmul.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/unbind.py -> build/lib.linux-aarch64-cpython-310/xformers/ops creating build/lib.linux-aarch64-cpython-310/xformers/profiler copying xformers/profiler/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/profiler copying xformers/profiler/api.py -> build/lib.linux-aarch64-cpython-310/xformers/profiler copying xformers/profiler/device_limits.py -> build/lib.linux-aarch64-cpython-310/xformers/profiler copying xformers/profiler/profiler.py -> build/lib.linux-aarch64-cpython-310/xformers/profiler copying xformers/profiler/profiler_dcgm.py -> build/lib.linux-aarch64-cpython-310/xformers/profiler copying xformers/profiler/profiler_dcgm_impl.py -> build/lib.linux-aarch64-cpython-310/xformers/profiler copying xformers/profiler/slow_ops_profiler.py -> build/lib.linux-aarch64-cpython-310/xformers/profiler creating build/lib.linux-aarch64-cpython-310/xformers/sparse copying xformers/sparse/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/sparse copying xformers/sparse/_csr_ops.py -> build/lib.linux-aarch64-cpython-310/xformers/sparse copying xformers/sparse/blocksparse_tensor.py -> build/lib.linux-aarch64-cpython-310/xformers/sparse copying xformers/sparse/csr_tensor.py -> build/lib.linux-aarch64-cpython-310/xformers/sparse copying xformers/sparse/utils.py -> build/lib.linux-aarch64-cpython-310/xformers/sparse creating build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/dropout.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/fused_linear_layer.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/k_activations.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/k_dropout.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/k_fused_matmul_bw.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/k_fused_matmul_fw.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/k_layer_norm.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/k_softmax.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/layer_norm.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/softmax.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/vararg_kernel.py -> build/lib.linux-aarch64-cpython-310/xformers/triton creating build/lib.linux-aarch64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/bert_padding.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/flash_attn_interface.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/flash_attn_triton.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/flash_attn_triton_og.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/flash_blocksparse_attention.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/flash_blocksparse_attn_interface.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/fused_softmax.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn creating build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA copying xformers/benchmarks/LRA/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA copying xformers/benchmarks/LRA/batch_fetch_results.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA copying xformers/benchmarks/LRA/batch_submit.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA copying xformers/benchmarks/LRA/run_grid_search.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA copying xformers/benchmarks/LRA/run_tasks.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA copying xformers/benchmarks/LRA/run_with_submitit.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA creating build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA/code copying xformers/benchmarks/LRA/code/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA/code copying xformers/benchmarks/LRA/code/dataset.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA/code copying xformers/benchmarks/LRA/code/model_wrapper.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA/code creating build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/_sputnik_sparse.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/attention_mask.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/attention_patterns.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/base.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/blocksparse.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/compositional.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/core.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/favor.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/fourier_mix.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/global_tokens.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/lambda_layer.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/linformer.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/local.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/nystrom.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/ortho.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/pooling.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/random.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/scaled_dot_product.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/sparsity_config.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/utils.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/visual.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention creating build/lib.linux-aarch64-cpython-310/xformers/components/feedforward copying xformers/components/feedforward/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/components/feedforward copying xformers/components/feedforward/base.py -> build/lib.linux-aarch64-cpython-310/xformers/components/feedforward copying xformers/components/feedforward/conv_mlp.py -> build/lib.linux-aarch64-cpython-310/xformers/components/feedforward copying xformers/components/feedforward/fused_mlp.py -> build/lib.linux-aarch64-cpython-310/xformers/components/feedforward copying xformers/components/feedforward/mixture_of_experts.py -> build/lib.linux-aarch64-cpython-310/xformers/components/feedforward copying xformers/components/feedforward/mlp.py -> build/lib.linux-aarch64-cpython-310/xformers/components/feedforward creating build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding copying xformers/components/positional_embedding/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding copying xformers/components/positional_embedding/base.py -> build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding copying xformers/components/positional_embedding/param.py -> build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding copying xformers/components/positional_embedding/rotary.py -> build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding copying xformers/components/positional_embedding/sine.py -> build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding copying xformers/components/positional_embedding/vocab.py -> build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding creating build/lib.linux-aarch64-cpython-310/xformers/components/attention/feature_maps copying xformers/components/attention/feature_maps/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention/feature_maps copying xformers/components/attention/feature_maps/base.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention/feature_maps copying xformers/components/attention/feature_maps/softmax.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention/feature_maps creating build/lib.linux-aarch64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/k_index_select_cat.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/k_scaled_index_add.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/rmsnorm_kernels.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/rope_padded_kernels.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/sequence_parallel_fused_kernels.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/tiled_matmul_kernels.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/_triton creating build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/attn_bias.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/common.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/cutlass.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/decoder.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/dispatch.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/flash.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/small_k.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/triton.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/triton_splitk.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha creating build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/layers copying xformers/_flash_attn/layers/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/layers copying xformers/_flash_attn/layers/patch_embed.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/layers copying xformers/_flash_attn/layers/rotary.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/layers creating build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/losses copying xformers/_flash_attn/losses/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/losses copying xformers/_flash_attn/losses/cross_entropy.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/losses creating build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/baichuan.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/bert.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/bigcode.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/falcon.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/gpt.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/gpt_neox.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/gptj.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/llama.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/opt.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/vit.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models creating build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/modules copying xformers/_flash_attn/modules/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/modules copying xformers/_flash_attn/modules/block.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/modules copying xformers/_flash_attn/modules/embedding.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/modules copying xformers/_flash_attn/modules/mha.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/modules copying xformers/_flash_attn/modules/mlp.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/modules creating build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops copying xformers/_flash_attn/ops/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops copying xformers/_flash_attn/ops/activations.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops copying xformers/_flash_attn/ops/fused_dense.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops copying xformers/_flash_attn/ops/layer_norm.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops copying xformers/_flash_attn/ops/rms_norm.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops creating build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/utils copying xformers/_flash_attn/utils/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/utils copying xformers/_flash_attn/utils/benchmark.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/utils copying xformers/_flash_attn/utils/distributed.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/utils copying xformers/_flash_attn/utils/generation.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/utils copying xformers/_flash_attn/utils/pretrained.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/utils creating build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/cross_entropy.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/k_activations.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/layernorm.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/linear.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/mlp.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/rotary.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton running build_ext /usr/lib64/python3.10/site-packages/torch/utils/cpp_extension.py:398: UserWarning: There are no g++ version bounds defined for CUDA version 12.1 warnings.warn(f'There are no {compiler_name} version bounds defined for CUDA version {cuda_str_version}') building 'xformers._C_flashattention' extension creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310 creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24 creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src Emitting ninja build file /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/build.ninja... Compiling objects... Using envvar MAX_JOBS (4) as the number of workers... [1/49] g++ -MMD -MF /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.o.d -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.o -O3 -std=c++17 -fopenmp -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp: In function ‘void set_params_fprop(Flash_fwd_params&, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t, at::Tensor, at::Tensor, at::Tensor, at::Tensor, void*, void*, void*, void*, void*, float, float, int, int)’: /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp:48:11: warning: ‘void* memset(void*, int, size_t)’ clearing an object of non-trivial type ‘struct Flash_fwd_params’; use assignment or value-initialization instead [-Wclass-memaccess] 48 | memset(¶ms, 0, sizeof(params)); | ~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp:13: /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash.h:51:8: note: ‘struct Flash_fwd_params’ declared here 51 | struct Flash_fwd_params : public Qkv_params { | ^~~~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp: In function ‘std::vector mha_fwd(at::Tensor&, const at::Tensor&, const at::Tensor&, c10::optional&, float, float, bool, int, int, bool, c10::optional)’: /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp:347:38: warning: narrowing conversion of ‘(char)(& q)->at::Tensor::.at::TensorBase::get_device()’ from ‘char’ to ‘c10::DeviceIndex’ {aka ‘signed char’} [-Wnarrowing] 347 | at::cuda::CUDAGuard device_guard{(char)q.get_device()}; | ^~~~~~~~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp: In function ‘std::vector mha_varlen_fwd(const at::Tensor&, const at::Tensor&, const at::Tensor&, c10::optional&, const at::Tensor&, const at::Tensor&, c10::optional&, int, int, float, float, bool, bool, int, int, bool, c10::optional)’: /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp:541:38: warning: narrowing conversion of ‘(char)(& q)->at::Tensor::.at::TensorBase::get_device()’ from ‘char’ to ‘c10::DeviceIndex’ {aka ‘signed char’} [-Wnarrowing] 541 | at::cuda::CUDAGuard device_guard{(char)q.get_device()}; | ^~~~~~~~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp: In function ‘std::vector mha_bwd(const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, c10::optional&, c10::optional&, c10::optional&, float, float, bool, int, int, c10::optional, c10::optional&)’: /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp:751:38: warning: narrowing conversion of ‘(char)(& q)->at::Tensor::.at::TensorBase::get_device()’ from ‘char’ to ‘c10::DeviceIndex’ {aka ‘signed char’} [-Wnarrowing] 751 | at::cuda::CUDAGuard device_guard{(char)q.get_device()}; | ^~~~~~~~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp: In function ‘std::vector mha_varlen_bwd(const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, c10::optional&, c10::optional&, c10::optional&, const at::Tensor&, const at::Tensor&, int, int, float, float, bool, bool, int, int, c10::optional, c10::optional&)’: /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp:973:38: warning: narrowing conversion of ‘(char)(& q)->at::Tensor::.at::TensorBase::get_device()’ from ‘char’ to ‘c10::DeviceIndex’ {aka ‘signed char’} [-Wnarrowing] 973 | at::cuda::CUDAGuard device_guard{(char)q.get_device()}; | ^~~~~~~~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp: In function ‘std::vector mha_fwd_kvcache(at::Tensor&, const at::Tensor&, const at::Tensor&, c10::optional&, c10::optional&, c10::optional&, c10::optional&, c10::optional&, c10::optional&, c10::optional&, float, bool, int, int, bool, int)’: /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp:1167:38: warning: narrowing conversion of ‘(char)(& q)->at::Tensor::.at::TensorBase::get_device()’ from ‘char’ to ‘c10::DeviceIndex’ {aka ‘signed char’} [-Wnarrowing] 1167 | at::cuda::CUDAGuard device_guard{(char)q.get_device()}; | ^~~~~~~~~~~~~~~~~~~~ [2/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 50 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 50 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 51 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 235 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 235 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 51 registers [3/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 51 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 51 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 235 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 235 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 800 bytes cmem[0] [4/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 45 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 45 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 800 bytes cmem[0] [5/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 47 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 47 registers, 800 bytes cmem[0] [6/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 220 bytes spill stores, 220 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 192 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 220 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 220 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 248 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 224 bytes stack frame, 256 bytes spill stores, 252 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 208 bytes stack frame, 232 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 208 bytes stack frame, 236 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 216 bytes stack frame, 236 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 212 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 208 bytes stack frame, 248 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 256 bytes spill stores, 252 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 208 bytes stack frame, 236 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 232 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 61 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 61 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 212 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 208 bytes stack frame, 248 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 256 bytes spill stores, 252 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 208 bytes stack frame, 236 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 232 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers, 800 bytes cmem[0] [7/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim224_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim224_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 800 bytes cmem[0] [8/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim224_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim224_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers [9/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 212 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 208 bytes stack frame, 248 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 256 bytes spill stores, 252 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 208 bytes stack frame, 236 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 232 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 220 bytes spill stores, 220 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 192 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 220 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 220 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 248 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 224 bytes stack frame, 256 bytes spill stores, 252 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 208 bytes stack frame, 232 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 208 bytes stack frame, 236 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 216 bytes stack frame, 236 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 212 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 208 bytes stack frame, 248 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 256 bytes spill stores, 252 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 208 bytes stack frame, 236 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 232 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 800 bytes cmem[0] [10/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 80 bytes stack frame, 88 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 88 bytes stack frame, 92 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 79 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 88 bytes stack frame, 96 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 104 bytes stack frame, 108 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 79 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 80 bytes stack frame, 88 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 88 bytes stack frame, 92 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 88 bytes stack frame, 96 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 104 bytes stack frame, 108 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers [11/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 800 bytes cmem[0] [12/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 800 bytes cmem[0] [13/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 80 bytes stack frame, 88 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 88 bytes stack frame, 92 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 88 bytes stack frame, 96 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 104 bytes stack frame, 108 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 80 bytes stack frame, 88 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 88 bytes stack frame, 92 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 88 bytes stack frame, 96 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 104 bytes stack frame, 108 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers [14/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 60 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 40 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 56 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 56 bytes stack frame, 76 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 52 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 40 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 64 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 32 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 68 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 48 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 64 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 24 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 88 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 104 bytes stack frame, 132 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 60 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 24 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 84 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 48 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 68 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 48 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 64 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 24 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 88 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 104 bytes stack frame, 132 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 60 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 24 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 84 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 48 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 800 bytes cmem[0] [15/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 60 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 40 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 56 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 56 bytes stack frame, 76 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 52 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 40 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 64 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 32 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 35 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 35 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 68 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 48 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 64 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 24 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 88 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 104 bytes stack frame, 132 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 60 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 24 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 84 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 48 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 68 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 48 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 64 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 24 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 88 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 104 bytes stack frame, 132 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 60 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 24 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 84 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 48 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers, 800 bytes cmem[0] [16/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 80 bytes stack frame, 88 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 16 bytes stack frame, 32 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 48 bytes stack frame, 60 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 80 bytes stack frame, 88 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 16 bytes stack frame, 32 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 48 bytes stack frame, 60 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 45 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 104 bytes stack frame, 120 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 112 bytes stack frame, 136 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 160 bytes stack frame, 192 bytes spill stores, 184 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 168 bytes stack frame, 200 bytes spill stores, 192 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 16 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers [17/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 80 bytes stack frame, 88 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 16 bytes stack frame, 32 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 48 bytes stack frame, 60 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 80 bytes stack frame, 88 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 16 bytes stack frame, 32 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 48 bytes stack frame, 60 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 104 bytes stack frame, 120 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 112 bytes stack frame, 136 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 160 bytes stack frame, 192 bytes spill stores, 184 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 168 bytes stack frame, 200 bytes spill stores, 192 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 16 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers [18/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 48 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 144 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 144 bytes stack frame, 264 bytes spill stores, 220 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 249 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 228 bytes spill stores, 260 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 208 bytes spill stores, 264 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 191 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 56 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 440 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 108 bytes spill stores, 272 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 80 bytes stack frame, 100 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 120 bytes stack frame, 252 bytes spill stores, 300 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 52 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 408 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 64 bytes spill stores, 140 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 104 bytes stack frame, 176 bytes spill stores, 264 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 56 bytes stack frame, 88 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 160 bytes stack frame, 296 bytes spill stores, 252 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 224 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 240 bytes stack frame, 392 bytes spill stores, 480 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 72 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 152 bytes spill stores, 128 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 169 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 189 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 36 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 432 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 92 bytes spill stores, 220 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 88 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 224 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 172 bytes spill stores, 188 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 424 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 84 bytes spill stores, 140 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 160 bytes stack frame, 396 bytes spill stores, 416 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 104 bytes stack frame, 172 bytes spill stores, 256 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 144 bytes stack frame, 264 bytes spill stores, 220 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 249 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 224 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 248 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 179 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 32 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 408 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 76 bytes spill stores, 128 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 80 bytes stack frame, 100 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 120 bytes stack frame, 252 bytes spill stores, 300 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 172 bytes spill stores, 236 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 392 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 52 bytes spill stores, 96 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [19/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 104 bytes stack frame, 176 bytes spill stores, 264 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 56 bytes stack frame, 88 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 160 bytes stack frame, 296 bytes spill stores, 252 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 224 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 240 bytes stack frame, 392 bytes spill stores, 480 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 72 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 152 bytes spill stores, 128 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 169 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 189 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 88 bytes stack frame, 62 bytes spill stores, 62 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 36 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 536 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 298 bytes spill stores, 630 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 136 bytes spill stores, 152 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 208 bytes stack frame, 410 bytes spill stores, 568 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 172 bytes spill stores, 188 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 152 bytes stack frame, 172 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 240 bytes stack frame, 438 bytes spill stores, 574 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 104 bytes stack frame, 172 bytes spill stores, 256 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 144 bytes stack frame, 264 bytes spill stores, 220 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 249 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 224 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 248 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 179 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 54 bytes spill stores, 54 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 32 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 504 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 288 bytes spill stores, 492 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 132 bytes spill stores, 156 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 192 bytes stack frame, 388 bytes spill stores, 470 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 172 bytes spill stores, 236 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 152 bytes stack frame, 174 bytes spill stores, 234 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 224 bytes stack frame, 418 bytes spill stores, 588 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 48 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 144 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 144 bytes stack frame, 264 bytes spill stores, 220 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 249 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 228 bytes spill stores, 260 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 208 bytes spill stores, 264 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 191 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 80 bytes stack frame, 62 bytes spill stores, 62 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 56 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 528 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 392 bytes spill stores, 680 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 132 bytes spill stores, 156 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 192 bytes stack frame, 388 bytes spill stores, 470 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 52 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 128 bytes stack frame, 178 bytes spill stores, 258 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 224 bytes stack frame, 418 bytes spill stores, 588 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [20/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 116 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 184 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 201 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 195 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 156 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 128 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 178 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 212 bytes spill stores, 244 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 592 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 428 bytes spill stores, 920 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 76 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 104 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 240 bytes spill stores, 192 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 592 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 608 bytes spill stores, 852 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 204 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 568 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 388 bytes spill stores, 908 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 64 bytes stack frame, 192 bytes spill stores, 180 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 568 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 552 bytes spill stores, 828 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 136 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 104 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 187 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 136 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 48 bytes stack frame, 76 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 164 bytes spill stores, 324 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 616 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 432 bytes spill stores, 912 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 56 bytes stack frame, 84 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 64 bytes stack frame, 108 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 72 bytes stack frame, 176 bytes spill stores, 200 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 616 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 484 bytes spill stores, 788 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 180 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 208 bytes spill stores, 304 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 552 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 324 bytes spill stores, 720 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 144 bytes stack frame, 308 bytes spill stores, 376 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 560 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 408 bytes spill stores, 644 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 120 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 104 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 187 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 160 bytes spill stores, 140 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 185 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 64 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 176 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 552 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 364 bytes spill stores, 800 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 56 bytes stack frame, 84 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 64 bytes stack frame, 108 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 184 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 616 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 484 bytes spill stores, 788 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 180 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 235 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 212 bytes spill stores, 240 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 296 bytes spill stores, 680 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 144 bytes stack frame, 308 bytes spill stores, 376 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 560 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 408 bytes spill stores, 644 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] [21/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim160_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim160_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 116 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 184 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 201 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 195 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 156 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 128 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 178 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 160 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 212 bytes spill stores, 244 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 664 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 774 bytes spill stores, 1342 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 76 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 178 bytes spill stores, 162 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 240 bytes spill stores, 192 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 664 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 846 bytes spill stores, 1098 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 52 bytes spill stores, 62 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 44 bytes spill stores, 106 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 204 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 640 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 678 bytes spill stores, 1222 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 64 bytes stack frame, 192 bytes spill stores, 180 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 248 bytes stack frame, 462 bytes spill stores, 614 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 144 bytes stack frame, 40 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 136 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 104 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 187 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 136 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 48 bytes stack frame, 76 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 144 bytes stack frame, 116 bytes spill stores, 112 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 164 bytes spill stores, 324 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 728 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 720 bytes spill stores, 1372 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 56 bytes stack frame, 84 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 178 bytes spill stores, 162 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 72 bytes stack frame, 176 bytes spill stores, 200 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 752 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 768 bytes spill stores, 1172 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 180 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 16 bytes spill stores, 50 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 208 bytes spill stores, 304 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 688 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 698 bytes spill stores, 1186 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 144 bytes stack frame, 308 bytes spill stores, 376 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 466 bytes spill stores, 450 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 120 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 104 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 187 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 160 bytes spill stores, 140 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 185 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 64 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 160 bytes stack frame, 120 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 176 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 728 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 728 bytes spill stores, 1288 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 56 bytes stack frame, 84 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 178 bytes spill stores, 162 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 184 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 752 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 768 bytes spill stores, 1172 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 180 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 16 bytes spill stores, 50 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 212 bytes spill stores, 240 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 648 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 658 bytes spill stores, 1138 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 144 bytes stack frame, 308 bytes spill stores, 376 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 466 bytes spill stores, 450 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [22/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 215 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 24 bytes stack frame, 20 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 219 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 24 bytes stack frame, 20 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [23/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 32 bytes stack frame, 36 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 144 bytes stack frame, 56 bytes spill stores, 128 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 94 bytes spill stores, 114 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 56 bytes stack frame, 48 bytes spill stores, 70 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 88 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 215 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 40 bytes stack frame, 40 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 64 bytes stack frame, 68 bytes spill stores, 94 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 106 bytes spill stores, 138 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 96 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 64 bytes stack frame, 52 bytes spill stores, 86 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 200 bytes stack frame, 78 bytes spill stores, 164 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 219 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 72 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 94 bytes spill stores, 114 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 64 bytes stack frame, 44 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 88 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [24/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 88 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 88 bytes stack frame, 92 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 64 bytes stack frame, 84 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 88 bytes stack frame, 88 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 80 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 96 bytes stack frame, 100 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 72 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 84 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 80 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 88 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 32 bytes stack frame, 44 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 72 bytes stack frame, 88 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 40 bytes stack frame, 44 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 56 bytes stack frame, 56 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 96 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 56 bytes stack frame, 56 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 32 bytes stack frame, 40 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 104 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 80 bytes stack frame, 84 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 32 bytes stack frame, 44 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 233 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 112 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 96 bytes stack frame, 116 bytes spill stores, 136 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 40 bytes stack frame, 44 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 52 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 88 bytes stack frame, 100 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 56 bytes stack frame, 56 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 84 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 152 bytes spill stores, 172 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 32 bytes stack frame, 40 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [25/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 104 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 88 bytes stack frame, 88 bytes spill stores, 80 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 264 bytes stack frame, 292 bytes spill stores, 280 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 224 bytes stack frame, 108 bytes spill stores, 204 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 233 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 48 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 112 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 128 bytes stack frame, 200 bytes spill stores, 266 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 120 bytes stack frame, 184 bytes spill stores, 160 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 52 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 48 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 84 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 208 bytes stack frame, 246 bytes spill stores, 316 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 272 bytes spill stores, 272 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 88 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 64 bytes stack frame, 62 bytes spill stores, 54 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 80 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 272 bytes stack frame, 136 bytes spill stores, 238 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 64 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 192 bytes stack frame, 256 bytes spill stores, 292 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 120 bytes stack frame, 220 bytes spill stores, 220 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 80 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 80 bytes stack frame, 82 bytes spill stores, 86 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 72 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 96 bytes stack frame, 104 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 276 bytes spill stores, 276 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 80 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 88 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 48 bytes stack frame, 58 bytes spill stores, 50 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 96 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 224 bytes stack frame, 108 bytes spill stores, 204 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 72 bytes stack frame, 88 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 88 bytes stack frame, 104 bytes spill stores, 136 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 120 bytes stack frame, 184 bytes spill stores, 160 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 40 bytes stack frame, 50 bytes spill stores, 42 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 96 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 92 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 272 bytes spill stores, 272 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [26/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 128 bytes stack frame, 176 bytes spill stores, 152 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 112 bytes stack frame, 132 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 160 bytes stack frame, 236 bytes spill stores, 192 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 376 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 36 bytes spill stores, 60 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 112 bytes stack frame, 148 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 280 bytes spill stores, 288 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 168 bytes stack frame, 220 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 68 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 144 bytes stack frame, 224 bytes spill stores, 184 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 344 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 72 bytes stack frame, 68 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 48 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 216 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 24 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 120 bytes stack frame, 144 bytes spill stores, 128 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 88 bytes stack frame, 148 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 408 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 72 bytes spill stores, 168 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 176 bytes spill stores, 152 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 56 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 56 bytes stack frame, 72 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 184 bytes spill stores, 156 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 376 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 36 bytes spill stores, 80 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 192 bytes stack frame, 326 bytes spill stores, 304 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 112 bytes stack frame, 132 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 160 bytes stack frame, 236 bytes spill stores, 192 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 352 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 112 bytes stack frame, 148 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 280 bytes spill stores, 288 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 68 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 144 bytes stack frame, 224 bytes spill stores, 184 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 336 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 72 bytes stack frame, 68 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 48 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [27/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 104 bytes stack frame, 152 bytes spill stores, 140 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 120 bytes stack frame, 240 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 114 bytes spill stores, 158 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 248 bytes stack frame, 224 bytes spill stores, 280 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 96 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 528 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 190 bytes spill stores, 278 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 168 bytes stack frame, 128 bytes spill stores, 152 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 304 bytes spill stores, 244 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 112 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 264 bytes stack frame, 260 bytes spill stores, 310 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 488 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 166 bytes spill stores, 250 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 144 bytes stack frame, 284 bytes spill stores, 220 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 96 bytes stack frame, 140 bytes spill stores, 132 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 96 bytes stack frame, 236 bytes spill stores, 204 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 272 bytes stack frame, 96 bytes spill stores, 160 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 280 bytes stack frame, 252 bytes spill stores, 296 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 88 bytes stack frame, 74 bytes spill stores, 98 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 504 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 212 bytes spill stores, 284 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 96 bytes stack frame, 160 bytes spill stores, 128 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 112 bytes stack frame, 284 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 252 bytes spill stores, 244 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 68 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 296 bytes stack frame, 236 bytes spill stores, 312 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 440 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 144 bytes spill stores, 192 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 72 bytes stack frame, 68 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 112 bytes stack frame, 280 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 80 bytes stack frame, 124 bytes spill stores, 112 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 96 bytes stack frame, 244 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 272 bytes stack frame, 96 bytes spill stores, 160 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 280 bytes stack frame, 252 bytes spill stores, 296 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 80 bytes stack frame, 66 bytes spill stores, 74 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 456 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 140 bytes spill stores, 176 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 96 bytes stack frame, 160 bytes spill stores, 128 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 112 bytes stack frame, 284 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 264 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 68 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 296 bytes stack frame, 236 bytes spill stores, 312 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 424 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 108 bytes spill stores, 156 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 72 bytes stack frame, 68 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 112 bytes stack frame, 280 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [28/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 176 bytes stack frame, 228 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 112 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 72 bytes stack frame, 116 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 496 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 188 bytes spill stores, 244 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 164 bytes spill stores, 148 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 304 bytes stack frame, 346 bytes spill stores, 328 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 264 bytes stack frame, 436 bytes spill stores, 408 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 736 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 488 bytes spill stores, 464 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 64 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 536 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 228 bytes spill stores, 276 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 128 bytes stack frame, 176 bytes spill stores, 168 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 236 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 640 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 392 bytes spill stores, 396 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 176 bytes stack frame, 228 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 152 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 72 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 576 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 288 bytes spill stores, 352 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 164 bytes spill stores, 148 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 304 bytes stack frame, 346 bytes spill stores, 328 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 264 bytes stack frame, 436 bytes spill stores, 408 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 736 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 488 bytes spill stores, 464 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 72 bytes stack frame, 72 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 632 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 332 bytes spill stores, 380 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 128 bytes stack frame, 176 bytes spill stores, 168 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 236 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 632 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 388 bytes spill stores, 392 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 92 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 60 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 152 bytes stack frame, 216 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 36 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 512 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 224 bytes spill stores, 268 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 80 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 280 bytes stack frame, 316 bytes spill stores, 300 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 848 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 632 bytes spill stores, 672 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 592 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 340 bytes spill stores, 384 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 116 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 160 bytes stack frame, 288 bytes spill stores, 284 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 776 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 612 bytes spill stores, 652 bytes spill loads [29/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 92 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 60 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 152 bytes stack frame, 216 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 608 bytes stack frame, 384 bytes spill stores, 448 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 36 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 1416 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1016 bytes spill stores, 1262 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 80 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 720 bytes stack frame, 580 bytes spill stores, 552 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1424 bytes spill stores, 1728 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1400 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1128 bytes spill stores, 1322 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 120 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 160 bytes stack frame, 288 bytes spill stores, 284 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 784 bytes stack frame, 1024 bytes spill stores, 984 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 176 bytes stack frame, 228 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 696 bytes stack frame, 476 bytes spill stores, 544 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 72 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 1384 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1048 bytes spill stores, 1330 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 164 bytes spill stores, 148 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 728 bytes stack frame, 664 bytes spill stores, 636 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 264 bytes stack frame, 436 bytes spill stores, 408 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 1528 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1336 bytes spill stores, 1614 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 72 bytes stack frame, 72 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1408 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1044 bytes spill stores, 1272 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 128 bytes stack frame, 184 bytes spill stores, 176 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 236 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 776 bytes stack frame, 1024 bytes spill stores, 984 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 176 bytes stack frame, 228 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 688 bytes stack frame, 492 bytes spill stores, 574 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 72 bytes stack frame, 116 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 1376 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 916 bytes spill stores, 1144 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 164 bytes spill stores, 148 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 728 bytes stack frame, 664 bytes spill stores, 636 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 264 bytes stack frame, 436 bytes spill stores, 408 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 1528 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1340 bytes spill stores, 1618 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 64 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1368 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1008 bytes spill stores, 1182 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 128 bytes stack frame, 184 bytes spill stores, 176 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 236 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 776 bytes stack frame, 1024 bytes spill stores, 984 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [30/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 224 bytes stack frame, 368 bytes spill stores, 384 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 104 bytes stack frame, 152 bytes spill stores, 136 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 92 bytes spill stores, 80 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 248 bytes stack frame, 412 bytes spill stores, 372 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 152 bytes stack frame, 192 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 200 bytes stack frame, 264 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 272 bytes stack frame, 448 bytes spill stores, 476 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 48 bytes stack frame, 44 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 48 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 132 bytes spill stores, 128 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 176 bytes stack frame, 328 bytes spill stores, 348 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 208 bytes spill stores, 180 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 208 bytes stack frame, 340 bytes spill stores, 356 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 104 bytes stack frame, 152 bytes spill stores, 136 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 92 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 200 bytes stack frame, 340 bytes spill stores, 304 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 100 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 168 bytes stack frame, 244 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 240 bytes stack frame, 364 bytes spill stores, 364 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 128 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 184 bytes stack frame, 296 bytes spill stores, 288 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 48 bytes stack frame, 44 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 332 bytes spill stores, 324 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 232 bytes stack frame, 360 bytes spill stores, 376 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 104 bytes stack frame, 152 bytes spill stores, 136 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 92 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 200 bytes stack frame, 340 bytes spill stores, 304 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 96 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 168 bytes stack frame, 244 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 240 bytes stack frame, 364 bytes spill stores, 364 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 40 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 128 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 184 bytes stack frame, 296 bytes spill stores, 288 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 332 bytes spill stores, 324 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [31/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 208 bytes stack frame, 340 bytes spill stores, 356 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 104 bytes stack frame, 152 bytes spill stores, 136 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 92 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 200 bytes stack frame, 340 bytes spill stores, 304 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 100 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 168 bytes stack frame, 244 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 240 bytes stack frame, 364 bytes spill stores, 364 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 232 bytes stack frame, 176 bytes spill stores, 222 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 176 bytes stack frame, 196 bytes spill stores, 284 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 240 bytes stack frame, 348 bytes spill stores, 344 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 384 bytes stack frame, 424 bytes spill stores, 504 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 216 bytes stack frame, 238 bytes spill stores, 314 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 264 bytes stack frame, 556 bytes spill stores, 496 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 224 bytes stack frame, 368 bytes spill stores, 384 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 104 bytes stack frame, 152 bytes spill stores, 136 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 92 bytes spill stores, 80 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 248 bytes stack frame, 412 bytes spill stores, 372 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 152 bytes stack frame, 192 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 200 bytes stack frame, 264 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 272 bytes stack frame, 448 bytes spill stores, 476 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 208 bytes stack frame, 166 bytes spill stores, 174 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 192 bytes stack frame, 200 bytes spill stores, 284 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 200 bytes stack frame, 246 bytes spill stores, 226 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 424 bytes stack frame, 536 bytes spill stores, 610 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 176 bytes stack frame, 172 bytes spill stores, 256 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 176 bytes stack frame, 370 bytes spill stores, 318 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 232 bytes stack frame, 360 bytes spill stores, 376 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 104 bytes stack frame, 152 bytes spill stores, 136 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 92 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 200 bytes stack frame, 340 bytes spill stores, 304 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 96 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 168 bytes stack frame, 244 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 240 bytes stack frame, 364 bytes spill stores, 364 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 232 bytes stack frame, 176 bytes spill stores, 222 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 184 bytes stack frame, 228 bytes spill stores, 286 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 240 bytes stack frame, 348 bytes spill stores, 344 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 384 bytes stack frame, 424 bytes spill stores, 504 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 160 bytes stack frame, 208 bytes spill stores, 276 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 264 bytes stack frame, 556 bytes spill stores, 496 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [32/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 155 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 104 bytes spill stores, 132 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 504 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 164 bytes spill stores, 276 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 196 bytes spill stores, 184 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 568 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 316 bytes spill stores, 476 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 496 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 192 bytes spill stores, 280 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 296 bytes spill stores, 320 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 178 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 162 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 160 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 145 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 130 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 48 bytes stack frame, 140 bytes spill stores, 132 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 544 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 256 bytes spill stores, 464 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 120 bytes stack frame, 148 bytes spill stores, 144 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 64 bytes stack frame, 120 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 344 bytes spill stores, 404 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 480 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 200 bytes spill stores, 368 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 160 bytes stack frame, 284 bytes spill stores, 268 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 150 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 174 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 160 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 158 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 136 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 84 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 400 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 72 bytes spill stores, 120 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 196 bytes spill stores, 184 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 568 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 316 bytes spill stores, 476 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 360 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 24 bytes spill stores, 36 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 296 bytes spill stores, 320 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 152 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 138 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 162 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 160 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers, 688 bytes cmem[0] [33/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 155 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 296 bytes stack frame, 222 bytes spill stores, 266 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 104 bytes spill stores, 132 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 752 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 588 bytes spill stores, 800 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 344 bytes stack frame, 248 bytes spill stores, 320 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 488 bytes stack frame, 542 bytes spill stores, 644 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 792 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 612 bytes spill stores, 752 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 432 bytes stack frame, 464 bytes spill stores, 588 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 193 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 162 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 160 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 136 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 272 bytes stack frame, 194 bytes spill stores, 254 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 84 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 768 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 524 bytes spill stores, 692 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 344 bytes stack frame, 248 bytes spill stores, 320 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 488 bytes stack frame, 542 bytes spill stores, 644 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 696 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 416 bytes spill stores, 516 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 432 bytes stack frame, 464 bytes spill stores, 588 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 152 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 162 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 160 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 145 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 130 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 264 bytes stack frame, 214 bytes spill stores, 214 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 48 bytes stack frame, 140 bytes spill stores, 132 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 776 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 510 bytes spill stores, 726 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 408 bytes stack frame, 368 bytes spill stores, 360 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 64 bytes stack frame, 120 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 408 bytes stack frame, 530 bytes spill stores, 608 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 680 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 416 bytes spill stores, 560 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 376 bytes stack frame, 460 bytes spill stores, 554 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 150 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 160 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 158 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers [34/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 235 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2560 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 1280 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 640 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 320 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 160 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 48 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2560 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1280 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 640 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 320 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 160 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 48 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1600 bytes stack frame, 16 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 235 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads [35/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2560 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 1280 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 640 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 320 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 160 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 48 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2560 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1280 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 640 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 320 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 160 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 48 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 235 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1536 bytes stack frame, 16 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] [36/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 170 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 173 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 197 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 199 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 180 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 192 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 183 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 144 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 170 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 174 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 193 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 192 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 189 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 199 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 169 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 197 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 187 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 185 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 192 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 174 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 179 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 191 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 175 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers, 688 bytes cmem[0] [37/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 169 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 197 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 191 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 193 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 192 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 174 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 175 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 191 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 183 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 170 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 173 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 197 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 175 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 192 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 177 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 201 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 144 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 170 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 174 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 193 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 189 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 192 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 183 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 189 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 180 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 199 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers [38/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 4608 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 2304 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 1152 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 576 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 288 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 4608 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 2304 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 1152 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 576 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 288 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 80 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 215 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 215 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 221 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 219 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads [39/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 4608 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 2304 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 1152 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 576 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 288 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 4608 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 2304 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 1152 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 576 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 288 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 80 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 215 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 219 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] [40/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJNSA_ILi4EEESC_EEESE_SF_EEENS9_IJSI_iNSA_ILi16EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS14_ISJ_Li0EEENS14_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJNSA_ILi4EEESC_EEESE_SF_EEENS9_IJSI_iNSA_ILi16EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS14_ISJ_Li0EEENS14_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 232 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 88 bytes stack frame, 104 bytes spill stores, 112 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJNSA_ILi4EEESC_EEESE_SF_EEENS9_IJSI_iNSA_ILi16EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS14_ISJ_Li0EEENS14_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 144 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 233 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJNSA_ILi4EEESC_EEESE_SF_EEENS9_IJSI_iNSA_ILi16EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS14_ISJ_Li0EEENS14_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 219 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJNSA_ILi4EEESC_EEESE_SF_EEENS9_IJSI_iNSA_ILi16EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS14_ISJ_Li0EEENS14_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 208 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 104 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJNSA_ILi4EEESC_EEESE_SF_EEENS9_IJSI_iNSA_ILi16EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS14_ISJ_Li0EEENS14_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 221 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJNSA_ILi4EEESC_EEESE_SF_EEENS9_IJSI_iNSA_ILi16EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS14_ISJ_Li0EEENS14_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 219 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJNSA_ILi4EEESC_EEESE_SF_EEENS9_IJSI_iNSA_ILi16EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS14_ISJ_Li0EEENS14_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 232 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJNSA_ILi4EEESC_EEESE_SF_EEENS9_IJSI_iNSA_ILi16EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS14_ISJ_Li0EEENS14_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads [41/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 221 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 215 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 219 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 144 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 160 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 88 bytes stack frame, 104 bytes spill stores, 112 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 144 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 233 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 219 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 144 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 104 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers [42/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1632 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1648 bytes stack frame, 48 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1648 bytes stack frame, 48 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1632 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 2560 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 1280 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 640 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 320 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 160 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 48 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 2560 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 1280 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 640 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 320 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 160 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 48 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1648 bytes stack frame, 48 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1664 bytes stack frame, 64 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1648 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1664 bytes stack frame, 60 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 33 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 33 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 33 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 33 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1632 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1632 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1648 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1648 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads [43/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 144 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 249 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] [44/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1600 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1616 bytes stack frame, 48 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1616 bytes stack frame, 48 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1600 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 33 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 33 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 33 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 33 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1600 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1600 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1616 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1616 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 2560 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 1280 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 640 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 320 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 160 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 48 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 2560 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 1280 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 640 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 320 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 160 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 48 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1616 bytes stack frame, 48 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1616 bytes stack frame, 48 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1616 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1616 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads [45/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 144 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 249 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] [46/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 172 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 172 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 180 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 173 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 189 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 219 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 207 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 173 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 189 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 185 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 221 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 221 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] [47/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 173 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 189 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 187 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 181 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 235 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 173 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 189 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 193 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 205 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 172 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 172 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 180 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 215 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers [48/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 4608 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2304 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 1152 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 576 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 288 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 4608 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2304 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1152 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 576 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 288 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 80 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 8 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 8 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1744 bytes stack frame, 52 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1744 bytes stack frame, 32 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 2128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb0ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 76 bytes spill stores, 100 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 2128 bytes stack frame, 8 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb0ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 64 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 2144 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb0ELb1ELb0ELb1ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 116 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 2144 bytes stack frame, 8 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb0ELb1ELb0ELb1ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 100 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 2112 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb1ELb0ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 64 bytes spill stores, 76 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 2096 bytes stack frame, 8 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb1ELb0ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 36 bytes spill stores, 68 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 128 bytes stack frame, 144 bytes spill stores, 136 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 1840 bytes stack frame, 128 bytes spill stores, 144 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 1824 bytes stack frame, 120 bytes spill stores, 168 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 8 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 8 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1728 bytes stack frame, 4 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1728 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 2080 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb0ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 2064 bytes stack frame, 8 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb0ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 8 bytes spill stores, 32 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 2096 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb0ELb1ELb0ELb1ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 68 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 2112 bytes stack frame, 8 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb0ELb1ELb0ELb1ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 64 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 56 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 2080 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb1ELb0ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 28 bytes spill stores, 40 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 2064 bytes stack frame, 8 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb1ELb0ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 8 bytes spill stores, 36 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 152 bytes spill stores, 144 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 1744 bytes stack frame, 28 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 1744 bytes stack frame, 40 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 8 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 8 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1728 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1728 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 2064 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb0ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 2096 bytes stack frame, 8 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb0ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 48 bytes spill stores, 100 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 2096 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb0ELb1ELb0ELb1ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 68 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 2112 bytes stack frame, 8 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb0ELb1ELb0ELb1ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 64 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 48 bytes stack frame, 56 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 2080 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb1ELb0ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 28 bytes spill stores, 40 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 2064 bytes stack frame, 8 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb1ELb0ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 8 bytes spill stores, 36 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 152 bytes spill stores, 144 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 1744 bytes stack frame, 28 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 1744 bytes stack frame, 40 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads [49/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 8 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 8 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 2064 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb0ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 2112 bytes stack frame, 8 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb0ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 48 bytes spill stores, 72 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 1728 bytes stack frame, 24 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 1728 bytes stack frame, 24 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 56 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1728 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1728 bytes stack frame, 8 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 152 bytes spill stores, 144 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 1744 bytes stack frame, 28 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 1744 bytes stack frame, 40 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 4608 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2304 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 1152 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 576 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 288 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 4608 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2304 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1152 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 576 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 288 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 80 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 8 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 8 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 2112 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb0ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 2144 bytes stack frame, 8 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb0ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 88 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 1824 bytes stack frame, 112 bytes spill stores, 144 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 1824 bytes stack frame, 112 bytes spill stores, 144 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1760 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1760 bytes stack frame, 68 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 128 bytes stack frame, 144 bytes spill stores, 136 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 1840 bytes stack frame, 128 bytes spill stores, 144 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 1824 bytes stack frame, 120 bytes spill stores, 168 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 8 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 8 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 2080 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb0ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 2064 bytes stack frame, 8 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash30compute_attn_1rowblock_splitkvI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES3_EELb0ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iiiii 0 bytes stack frame, 8 bytes spill stores, 32 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 1728 bytes stack frame, 24 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 1728 bytes stack frame, 24 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 48 bytes stack frame, 56 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1728 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1728 bytes stack frame, 8 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 152 bytes spill stores, 144 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 1744 bytes stack frame, 28 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi64ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 1744 bytes stack frame, 40 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi16EEESC_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjSG_EEEEES7_SJ_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SC_EEENS9_IJNS9_IJSG_SC_EEEiSG_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSU_ISE_Li0EEESG_EEEEEEEvRKNS1_6TensorIT1_T2_EERNS10_IT3_T4_EERKNS10_IT5_T6_EES1E_RKNS10_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm128ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi32EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm32ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_S9_EEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS7_ILi8EEENS7_ILi16EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_S9_EEENS6_IJSD_SF_NS6_IJNS7_ILi128EEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSP_SA_EEESA_S9_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJiiEEEEEEEEEENS2_IS16_NS5_INS6_IJS17_SQ_S9_EEENS6_IJS19_NS7_ILi1024EEES1A_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1T_S1T_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SP_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJSX_SA_SQ_EEENS6_IJNS6_IJNS7_ILi64EEESP_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJS25_SQ_EEEEENS1W_IS1Z_NS5_INS6_IJS20_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSB_SA_SE_EEENS6_IJNS6_IJSQ_SX_EEENS6_IJSP_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2D_iEENS2P_IS2O_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads g++ -shared -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -Wl,--build-id=sha1 -g -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -Wl,--build-id=sha1 -g -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim224_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim224_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim160_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.o -L/usr/lib64/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.1/lib64 -L/usr/lib64 -lc10 -ltorch -ltorch_cpu -ltorch_python -lcudart -lc10_cuda -ltorch_cuda -lpython3.10 -o build/lib.linux-aarch64-cpython-310/xformers/_C_flashattention.so building 'xformers._C' extension creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/autograd creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cpu creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/sparse24 creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/swiglu creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/swiglu/cuda Emitting ninja build file /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/build.ninja... Compiling objects... Using envvar MAX_JOBS (4) as the number of workers... [1/82] g++ -MMD -MF /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/attention.o.d -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/attention.cpp -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/attention.o -O3 -std=c++17 -fopenmp -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 [2/82] g++ -MMD -MF /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cpu/matmul.o.d -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cpu/matmul.cpp -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cpu/matmul.o -O3 -std=c++17 -fopenmp -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 [3/82] g++ -MMD -MF /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cpu/sddmm.o.d -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cpu/sddmm.cpp -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cpu/sddmm.o -O3 -std=c++17 -fopenmp -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 [4/82] g++ -MMD -MF /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/autograd/matmul.o.d -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/autograd/matmul.cpp -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/autograd/matmul.o -O3 -std=c++17 -fopenmp -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 [5/82] g++ -MMD -MF /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cpu/spmm.o.d -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cpu/spmm.cpp -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cpu/spmm.o -O3 -std=c++17 -fopenmp -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 [6/82] g++ -MMD -MF /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cpu/sparse_softmax.o.d -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cpu/sparse_softmax.cpp -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cpu/sparse_softmax.o -O3 -std=c++17 -fopenmp -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 [7/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/attention_cutlass_rand_uniform.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/attention_cutlass_rand_uniform.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 218049 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_ZN74_GLOBAL__N__f7eb95ba_33_attention_cutlass_rand_uniform_cu_ac8d307b_128894719rand_uniform_kernelIfEEvlllfN2at15PhiloxCudaStateEPT_l' for 'sm_50' ptxas info : Function properties for _ZN74_GLOBAL__N__f7eb95ba_33_attention_cutlass_rand_uniform_cu_ac8d307b_128894719rand_uniform_kernelIfEEvlllfN2at15PhiloxCudaStateEPT_l 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 392 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 218049 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_ZN74_GLOBAL__N__f7eb95ba_33_attention_cutlass_rand_uniform_cu_ac8d307b_128894719rand_uniform_kernelIfEEvlllfN2at15PhiloxCudaStateEPT_l' for 'sm_60' ptxas info : Function properties for _ZN74_GLOBAL__N__f7eb95ba_33_attention_cutlass_rand_uniform_cu_ac8d307b_128894719rand_uniform_kernelIfEEvlllfN2at15PhiloxCudaStateEPT_l 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 392 bytes cmem[0] ptxas info : 218049 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_ZN74_GLOBAL__N__f7eb95ba_33_attention_cutlass_rand_uniform_cu_ac8d307b_128894719rand_uniform_kernelIfEEvlllfN2at15PhiloxCudaStateEPT_l' for 'sm_61' ptxas info : Function properties for _ZN74_GLOBAL__N__f7eb95ba_33_attention_cutlass_rand_uniform_cu_ac8d307b_128894719rand_uniform_kernelIfEEvlllfN2at15PhiloxCudaStateEPT_l 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 392 bytes cmem[0] ptxas info : 218049 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_ZN74_GLOBAL__N__f7eb95ba_33_attention_cutlass_rand_uniform_cu_ac8d307b_128894719rand_uniform_kernelIfEEvlllfN2at15PhiloxCudaStateEPT_l' for 'sm_70' ptxas info : Function properties for _ZN74_GLOBAL__N__f7eb95ba_33_attention_cutlass_rand_uniform_cu_ac8d307b_128894719rand_uniform_kernelIfEEvlllfN2at15PhiloxCudaStateEPT_l 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 43 registers, 424 bytes cmem[0] ptxas info : 218049 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_ZN74_GLOBAL__N__f7eb95ba_33_attention_cutlass_rand_uniform_cu_ac8d307b_128894719rand_uniform_kernelIfEEvlllfN2at15PhiloxCudaStateEPT_l' for 'sm_80' ptxas info : Function properties for _ZN74_GLOBAL__N__f7eb95ba_33_attention_cutlass_rand_uniform_cu_ac8d307b_128894719rand_uniform_kernelIfEEvlllfN2at15PhiloxCudaStateEPT_l 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 424 bytes cmem[0] ptxas info : 218049 bytes gmem ptxas info : Compiling entry function '_ZN74_GLOBAL__N__f7eb95ba_33_attention_cutlass_rand_uniform_cu_ac8d307b_128894719rand_uniform_kernelIfEEvlllfN2at15PhiloxCudaStateEPT_l' for 'sm_90' ptxas info : Function properties for _ZN74_GLOBAL__N__f7eb95ba_33_attention_cutlass_rand_uniform_cu_ac8d307b_128894719rand_uniform_kernelIfEEvlllfN2at15PhiloxCudaStateEPT_l 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : 218049 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_ZN74_GLOBAL__N__f7eb95ba_33_attention_cutlass_rand_uniform_cu_ac8d307b_128894719rand_uniform_kernelIfEEvlllfN2at15PhiloxCudaStateEPT_l' for 'sm_52' ptxas info : Function properties for _ZN74_GLOBAL__N__f7eb95ba_33_attention_cutlass_rand_uniform_cu_ac8d307b_128894719rand_uniform_kernelIfEEvlllfN2at15PhiloxCudaStateEPT_l 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 392 bytes cmem[0] ptxas info : 218049 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_ZN74_GLOBAL__N__f7eb95ba_33_attention_cutlass_rand_uniform_cu_ac8d307b_128894719rand_uniform_kernelIfEEvlllfN2at15PhiloxCudaStateEPT_l' for 'sm_86' ptxas info : Function properties for _ZN74_GLOBAL__N__f7eb95ba_33_attention_cutlass_rand_uniform_cu_ac8d307b_128894719rand_uniform_kernelIfEEvlllfN2at15PhiloxCudaStateEPT_l 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 424 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 218049 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_ZN74_GLOBAL__N__f7eb95ba_33_attention_cutlass_rand_uniform_cu_ac8d307b_128894719rand_uniform_kernelIfEEvlllfN2at15PhiloxCudaStateEPT_l' for 'sm_75' ptxas info : Function properties for _ZN74_GLOBAL__N__f7eb95ba_33_attention_cutlass_rand_uniform_cu_ac8d307b_128894719rand_uniform_kernelIfEEvlllfN2at15PhiloxCudaStateEPT_l 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 424 bytes cmem[0] [8/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/attention_forward_generic.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/attention_forward_generic.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 218050 bytes gmem, 72 bytes cmem[3] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 218050 bytes gmem, 72 bytes cmem[3] ptxas info : 218050 bytes gmem, 72 bytes cmem[3] ptxas info : 218050 bytes gmem, 72 bytes cmem[3] ptxas info : 218050 bytes gmem, 72 bytes cmem[3] ptxas info : 218050 bytes gmem, 112 bytes cmem[3] ptxas info : 218050 bytes gmem ptxas info : 218050 bytes gmem, 112 bytes cmem[3] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 218050 bytes gmem, 72 bytes cmem[3] [9/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k128.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k128.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218484 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassB_bf16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassB_bf16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z53fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_bf16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z44fmha_cutlassB_bf16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z55fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218484 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassB_bf16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassB_bf16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z53fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_bf16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z44fmha_cutlassB_bf16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z55fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218484 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassB_bf16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassB_bf16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z53fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_bf16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z44fmha_cutlassB_bf16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z55fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218484 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassB_bf16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassB_bf16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z53fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_bf16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z44fmha_cutlassB_bf16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z55fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218484 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassB_bf16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassB_bf16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z53fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_bf16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z44fmha_cutlassB_bf16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z55fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218484 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassB_bf16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassB_bf16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z53fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_bf16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z44fmha_cutlassB_bf16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z55fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218250 bytes gmem ptxas info : Compiling entry function '_Z42fmha_cutlassB_bf16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassB_bf16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers ptxas info : Compiling entry function '_Z53fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z53fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers ptxas info : Compiling entry function '_Z44fmha_cutlassB_bf16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z44fmha_cutlassB_bf16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE 40 bytes stack frame, 52 bytes spill stores, 88 bytes spill loads ptxas info : Used 128 registers ptxas info : Compiling entry function '_Z55fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z55fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE 24 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads ptxas info : Used 128 registers ptxas info : 218250 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassB_bf16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassB_bf16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z53fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_bf16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z44fmha_cutlassB_bf16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE 24 bytes stack frame, 32 bytes spill stores, 60 bytes spill loads ptxas info : Used 128 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z55fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE 8 bytes stack frame, 16 bytes spill stores, 32 bytes spill loads ptxas info : Used 128 registers, 784 bytes cmem[0] ptxas info : 218250 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassB_bf16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassB_bf16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 165 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z53fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 165 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_bf16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z44fmha_cutlassB_bf16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE 40 bytes stack frame, 48 bytes spill stores, 88 bytes spill loads ptxas info : Used 128 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z55fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE 24 bytes stack frame, 28 bytes spill stores, 52 bytes spill loads ptxas info : Used 128 registers, 784 bytes cmem[0] [10/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/attention_backward_generic.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/attention_backward_generic.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 218050 bytes gmem, 72 bytes cmem[3] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 218050 bytes gmem, 72 bytes cmem[3] ptxas info : 218050 bytes gmem, 72 bytes cmem[3] ptxas info : 218050 bytes gmem, 72 bytes cmem[3] ptxas info : 218050 bytes gmem, 72 bytes cmem[3] ptxas info : 218050 bytes gmem, 112 bytes cmem[3] ptxas info : 218050 bytes gmem ptxas info : 218050 bytes gmem, 112 bytes cmem[3] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 218050 bytes gmem, 72 bytes cmem[3] [11/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k128_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k128_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218272 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z50fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z50fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218272 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z50fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z50fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218272 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z50fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z50fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218272 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z50fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z50fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218272 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z50fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z50fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218272 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z50fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z50fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218150 bytes gmem ptxas info : Compiling entry function '_Z50fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z50fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE 208 bytes stack frame, 220 bytes spill stores, 310 bytes spill loads ptxas info : Used 128 registers ptxas info : 218150 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z50fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z50fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 164 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE 208 bytes stack frame, 240 bytes spill stores, 278 bytes spill loads ptxas info : Used 128 registers, 784 bytes cmem[0], 8 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218150 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z50fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z50fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE 184 bytes stack frame, 212 bytes spill stores, 242 bytes spill loads ptxas info : Used 128 registers, 784 bytes cmem[0], 8 bytes cmem[2] [12/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k32.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k32.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218263 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_bf16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z41fmha_cutlassB_bf16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218263 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_bf16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z41fmha_cutlassB_bf16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218263 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_bf16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z41fmha_cutlassB_bf16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218263 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_bf16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z41fmha_cutlassB_bf16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218263 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_bf16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z41fmha_cutlassB_bf16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218263 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_bf16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z41fmha_cutlassB_bf16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218150 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_bf16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z41fmha_cutlassB_bf16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers, 784 bytes cmem[0] ptxas info : 218150 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_bf16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z41fmha_cutlassB_bf16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 151 registers, 784 bytes cmem[0] ptxas info : 218150 bytes gmem ptxas info : Compiling entry function '_Z41fmha_cutlassB_bf16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z41fmha_cutlassB_bf16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 150 registers ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 160 registers [13/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k32_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k32_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218159 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z49fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218159 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z49fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218159 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z49fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218159 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z49fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218159 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z49fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218159 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z49fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218100 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z49fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 168 registers, 784 bytes cmem[0], 8 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218100 bytes gmem ptxas info : Compiling entry function '_Z49fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z49fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 168 registers /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218100 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z49fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers, 784 bytes cmem[0], 8 bytes cmem[2] [14/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k64.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k64.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218263 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_bf16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z41fmha_cutlassB_bf16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218263 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_bf16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z41fmha_cutlassB_bf16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218263 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_bf16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z41fmha_cutlassB_bf16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218263 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_bf16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z41fmha_cutlassB_bf16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218263 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_bf16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z41fmha_cutlassB_bf16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218263 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_bf16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z41fmha_cutlassB_bf16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218150 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_bf16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z41fmha_cutlassB_bf16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218150 bytes gmem ptxas info : Compiling entry function '_Z41fmha_cutlassB_bf16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z41fmha_cutlassB_bf16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 160 registers ptxas info : 218150 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_bf16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z41fmha_cutlassB_bf16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 168 registers, 784 bytes cmem[0] [15/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k64_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k64_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218159 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z49fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218159 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z49fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218159 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z49fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218159 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z49fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218159 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z49fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218159 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z49fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218100 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z49fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 168 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218100 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z49fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 168 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218100 bytes gmem ptxas info : Compiling entry function '_Z49fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z49fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 168 registers [16/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k65536.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k65536.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218259 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_bf16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z44fmha_cutlassB_bf16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_bf16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z45fmha_cutlassB_bf16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218259 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_bf16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z44fmha_cutlassB_bf16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_bf16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z45fmha_cutlassB_bf16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218259 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_bf16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z44fmha_cutlassB_bf16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_bf16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z45fmha_cutlassB_bf16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218259 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_bf16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z44fmha_cutlassB_bf16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_bf16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z45fmha_cutlassB_bf16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218259 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_bf16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z44fmha_cutlassB_bf16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_bf16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z45fmha_cutlassB_bf16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218259 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_bf16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z44fmha_cutlassB_bf16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_bf16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z45fmha_cutlassB_bf16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218150 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_bf16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z44fmha_cutlassB_bf16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_bf16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z45fmha_cutlassB_bf16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers, 784 bytes cmem[0] ptxas info : 218150 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_bf16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z44fmha_cutlassB_bf16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 158 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_bf16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z45fmha_cutlassB_bf16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 158 registers, 784 bytes cmem[0] ptxas info : 218150 bytes gmem ptxas info : Compiling entry function '_Z44fmha_cutlassB_bf16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z44fmha_cutlassB_bf16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 152 registers ptxas info : Compiling entry function '_Z45fmha_cutlassB_bf16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z45fmha_cutlassB_bf16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 150 registers [17/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k65536_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k65536_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218275 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z53fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218275 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z53fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218275 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z53fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218275 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z53fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218275 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z53fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218275 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z53fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218150 bytes gmem ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 161 registers ptxas info : Compiling entry function '_Z53fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z53fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 193 registers ptxas info : 218150 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 160 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z53fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z53fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 784 bytes cmem[0], 8 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218150 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z52fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 157 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z53fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z53fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 784 bytes cmem[0], 8 bytes cmem[2] [18/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k96.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k96.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218152 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassB_bf16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassB_bf16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218152 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassB_bf16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassB_bf16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218152 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassB_bf16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassB_bf16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218152 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassB_bf16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassB_bf16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218152 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassB_bf16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassB_bf16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218152 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassB_bf16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassB_bf16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218100 bytes gmem ptxas info : Compiling entry function '_Z42fmha_cutlassB_bf16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassB_bf16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers ptxas info : 218100 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassB_bf16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassB_bf16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 784 bytes cmem[0] ptxas info : 218100 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassB_bf16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassB_bf16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_10bfloat16_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 784 bytes cmem[0] [19/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k128.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k128.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 219159 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 752 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f16_aligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassB_f16_aligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f16_aligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassB_f16_aligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z54fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb1EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z53fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 219104 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f16_aligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassB_f16_aligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f16_aligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassB_f16_aligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z54fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb1EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z53fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218989 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f16_aligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassB_f16_aligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f16_aligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassB_f16_aligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z54fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb1EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z53fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 784 bytes cmem[0] ptxas info : 219159 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 752 bytes cmem[0], 24 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f16_aligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassB_f16_aligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f16_aligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassB_f16_aligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z54fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb1EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z53fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218975 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 165 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 165 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE 32 bytes stack frame, 40 bytes spill stores, 60 bytes spill loads ptxas info : Used 128 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f16_aligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassB_f16_aligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f16_aligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassB_f16_aligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z54fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE 24 bytes stack frame, 32 bytes spill stores, 60 bytes spill loads ptxas info : Used 128 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb1EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z53fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 219159 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 199 registers, 752 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f16_aligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassB_f16_aligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f16_aligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassB_f16_aligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z54fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb1EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z53fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218975 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE 24 bytes stack frame, 24 bytes spill stores, 32 bytes spill loads ptxas info : Used 128 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f16_aligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassB_f16_aligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f16_aligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassB_f16_aligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z54fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE 16 bytes stack frame, 16 bytes spill stores, 32 bytes spill loads ptxas info : Used 128 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb1EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z53fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 219159 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 752 bytes cmem[0], 24 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f16_aligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassB_f16_aligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f16_aligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassB_f16_aligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z54fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb1EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z53fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218975 bytes gmem ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_128x128_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb0EE6ParamsE 40 bytes stack frame, 48 bytes spill stores, 84 bytes spill loads ptxas info : Used 128 registers ptxas info : Compiling entry function '_Z42fmha_cutlassB_f16_aligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassB_f16_aligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z42fmha_cutlassB_f16_aligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassB_f16_aligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z54fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi128ELi128ELb1EE6ParamsE 24 bytes stack frame, 32 bytes spill stores, 44 bytes spill loads ptxas info : Used 128 registers ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb1EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z53fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi128ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers [20/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k32.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k32.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218623 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 752 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218623 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218623 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218563 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 784 bytes cmem[0] ptxas info : 218619 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218623 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers, 752 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218557 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 150 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218557 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 150 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218557 bytes gmem ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 151 registers ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi32ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 160 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb1EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi32ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers [21/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k128_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k128_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218758 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 752 bytes cmem[0], 44 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218758 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 752 bytes cmem[0], 44 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218703 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218695 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218692 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE 128 bytes stack frame, 160 bytes spill stores, 198 bytes spill loads ptxas info : Used 128 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218758 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 752 bytes cmem[0], 56 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218692 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE 104 bytes stack frame, 136 bytes spill stores, 170 bytes spill loads ptxas info : Used 128 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218692 bytes gmem ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE 112 bytes stack frame, 140 bytes spill stores, 234 bytes spill loads ptxas info : Used 128 registers ptxas info : Compiling entry function '_Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218758 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z49fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 752 bytes cmem[0], 56 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi128ELi128ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z50fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] [22/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k32_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k32_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 752 bytes cmem[0], 36 bytes cmem[2] ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 752 bytes cmem[0], 36 bytes cmem[2] ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 48 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 752 bytes cmem[0], 48 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218422 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 48 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 752 bytes cmem[0], 48 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218421 bytes gmem ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 165 registers ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218421 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218421 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] [23/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k64_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k64_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218422 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 752 bytes cmem[0], 36 bytes cmem[2] ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 752 bytes cmem[0], 36 bytes cmem[2] ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218421 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 168 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218421 bytes gmem ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 165 registers ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 48 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 752 bytes cmem[0], 48 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 48 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 752 bytes cmem[0], 48 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218421 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb1ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z48fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] [24/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k64.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k64.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218619 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218623 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 752 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218563 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218557 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 150 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218623 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers, 752 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218557 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 148 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218623 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218623 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218557 bytes gmem ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 152 registers ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z40fmha_cutlassB_f16_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassB_f16_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi64ELi64ELi64ELb1EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 158 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb1EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi64ELb1EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers [25/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k65536.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k65536.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218721 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 752 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218721 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 752 bytes cmem[0], 24 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218672 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 784 bytes cmem[0] ptxas info : 218664 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 164 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 157 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218721 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 752 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218721 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 752 bytes cmem[0], 24 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218662 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 158 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 158 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218662 bytes gmem ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 152 registers ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 152 registers ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218662 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 156 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f16_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 156 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z44fmha_cutlassB_f16_aligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] [26/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k65536_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k65536_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218769 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers, 752 bytes cmem[0], 44 bytes cmem[2] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218769 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 80 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 752 bytes cmem[0], 56 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218712 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218702 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218769 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 80 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 752 bytes cmem[0], 56 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218702 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218769 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 752 bytes cmem[0], 44 bytes cmem[2] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218702 bytes gmem ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218704 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z52fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] [27/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k96.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k96.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218151 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218151 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218151 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218151 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218151 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218151 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218100 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 784 bytes cmem[0] ptxas info : 218100 bytes gmem ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers ptxas info : 218100 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z41fmha_cutlassB_f16_aligned_128x64_k96_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80ENS0_6half_tELb1ELb0ELb1ELi128ELi64ELi96ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 784 bytes cmem[0] [28/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k128.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k128.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218567 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218567 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218567 bytes gmem ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218518 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 199 registers, 752 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218518 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 752 bytes cmem[0], 24 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218518 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 752 bytes cmem[0], 24 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218518 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 752 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218468 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218460 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z44fmha_cutlassB_f16_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z45fmha_cutlassB_f16_notaligned_128x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] [29/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k32.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k32.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218304 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218356 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers, 752 bytes cmem[0], 4 bytes cmem[2] ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 218356 bytes gmem ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218356 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 752 bytes cmem[0], 4 bytes cmem[2] [30/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k128_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k128_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218550 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 752 bytes cmem[0], 44 bytes cmem[2] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218550 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 752 bytes cmem[0], 44 bytes cmem[2] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218607 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218607 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218607 bytes gmem ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218492 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218550 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 752 bytes cmem[0], 56 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218550 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 752 bytes cmem[0], 56 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218484 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z52fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z53fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] [31/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k32_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k32_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 752 bytes cmem[0], 36 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 48 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 752 bytes cmem[0], 48 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 218380 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218380 bytes gmem ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 48 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 752 bytes cmem[0], 48 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 218380 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218320 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 752 bytes cmem[0], 36 bytes cmem[2] [32/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k64.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k64.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 752 bytes cmem[0], 4 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218356 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 218356 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218356 bytes gmem ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218304 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f16_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers, 752 bytes cmem[0], 4 bytes cmem[2] [33/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k64_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k64_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 752 bytes cmem[0], 36 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 752 bytes cmem[0], 36 bytes cmem[2] ptxas info : 218380 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218380 bytes gmem ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218380 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 48 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 752 bytes cmem[0], 48 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218320 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 48 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 752 bytes cmem[0], 48 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads [34/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k65536.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k65536.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218577 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218577 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218526 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 752 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218526 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 752 bytes cmem[0], 24 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218577 bytes gmem ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218474 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 192 registers, 784 bytes cmem[0] ptxas info : 218466 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 164 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 157 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218526 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 752 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218526 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z46fmha_cutlassB_f16_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 752 bytes cmem[0], 24 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z47fmha_cutlassB_f16_notaligned_128x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] [35/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k65536_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k65536_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218558 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 752 bytes cmem[0], 44 bytes cmem[2] ptxas info : Compiling entry function '_Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218558 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers, 752 bytes cmem[0], 44 bytes cmem[2] ptxas info : Compiling entry function '_Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218558 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 80 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 752 bytes cmem[0], 56 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218558 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 80 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 752 bytes cmem[0], 56 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218617 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218617 bytes gmem ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218617 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218498 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218490 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z54fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50ENS0_6half_tELb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z55fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70ENS0_6half_tELb0ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] [36/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k128.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k128.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218507 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 182 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f32_aligned_128x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassB_f32_aligned_128x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218507 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 179 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f32_aligned_128x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassB_f32_aligned_128x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218507 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 179 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f32_aligned_128x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassB_f32_aligned_128x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218507 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 179 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f32_aligned_128x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassB_f32_aligned_128x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218507 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 181 registers, 784 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f32_aligned_128x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassB_f32_aligned_128x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218507 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 784 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f32_aligned_128x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassB_f32_aligned_128x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218450 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f32_aligned_128x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassB_f32_aligned_128x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers, 784 bytes cmem[0] ptxas info : 218450 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassB_f32_aligned_128x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassB_f32_aligned_128x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 784 bytes cmem[0] ptxas info : 218450 bytes gmem ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z41fmha_cutlassB_f32_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z41fmha_cutlassB_f32_aligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z42fmha_cutlassB_f32_aligned_128x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassB_f32_aligned_128x64_k128_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers [37/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k32.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k32.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218402 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218402 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218402 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218402 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 157 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218397 bytes gmem ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218397 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218402 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218402 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 752 bytes cmem[0], 8 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218397 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] [38/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k128_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k128_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218539 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z50fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218539 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 784 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z50fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218539 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z50fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218539 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z50fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218539 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z50fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218539 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 784 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z50fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218474 bytes gmem ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z50fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z50fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : 218474 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z50fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 784 bytes cmem[0], 8 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218474 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 8 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z49fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z50fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z50fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi128ELb0EE6ParamsE 88 bytes stack frame, 100 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 784 bytes cmem[0], 8 bytes cmem[2] [39/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k32_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k32_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 784 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 752 bytes cmem[0], 40 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218421 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218421 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 201 registers, 784 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218421 bytes gmem ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers [40/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k64.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k64.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218402 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 752 bytes cmem[0], 8 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218402 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 157 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218402 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218402 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218402 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218397 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218397 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218397 bytes gmem ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218402 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassB_f32_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassB_f32_aligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 752 bytes cmem[0], 8 bytes cmem[2] [41/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k64_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k64_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 784 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218421 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218421 bytes gmem ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218421 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 752 bytes cmem[0], 40 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218426 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 201 registers, 784 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z48fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] [42/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k128.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k128.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218359 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218359 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218310 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 181 registers, 784 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218310 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 784 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218359 bytes gmem ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218310 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 179 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218310 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 179 registers, 752 bytes cmem[0], 8 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218310 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 182 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218310 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z44fmha_cutlassB_f32_notaligned_64x64_k128_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 179 registers, 752 bytes cmem[0], 8 bytes cmem[2] [43/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k65536.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k65536.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218515 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 179 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z44fmha_cutlassB_f32_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218515 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 181 registers, 784 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z44fmha_cutlassB_f32_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218515 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 784 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z44fmha_cutlassB_f32_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218515 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 179 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z44fmha_cutlassB_f32_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218456 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z44fmha_cutlassB_f32_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 784 bytes cmem[0] ptxas info : 218515 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 179 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z44fmha_cutlassB_f32_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218515 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 182 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z44fmha_cutlassB_f32_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218456 bytes gmem ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z44fmha_cutlassB_f32_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218456 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f32_aligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z44fmha_cutlassB_f32_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z44fmha_cutlassB_f32_aligned_128x64_k65536_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb0ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 32 bytes stack frame, 48 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 784 bytes cmem[0] [44/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k32.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k32.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218356 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 157 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218356 bytes gmem ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218356 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k32_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 752 bytes cmem[0], 8 bytes cmem[2] [45/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k65536_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k65536_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218547 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z52fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218547 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z52fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218547 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 784 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z52fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218547 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z52fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218547 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z52fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : 218547 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 784 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z52fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218480 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z52fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218480 bytes gmem ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z52fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218480 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 8 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb1ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z52fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80N23AttentionBackwardKernelIN7cutlass4arch4Sm80EfLb1ELb1ELb0ELi128ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 784 bytes cmem[0], 8 bytes cmem[2] [46/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k128_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k128_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218326 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218326 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218326 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218326 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 784 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218383 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218383 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218383 bytes gmem ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218326 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 40 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218326 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 784 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z52fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi128ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] [47/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k32_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k32_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 752 bytes cmem[0], 40 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218380 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218380 bytes gmem ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218380 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 784 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 201 registers, 784 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi32ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 752 bytes cmem[0], 40 bytes cmem[2] [48/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k64.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k64.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 157 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 752 bytes cmem[0], 8 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218356 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218356 bytes gmem ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218356 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers, 784 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218308 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassB_f32_notaligned_64x64_k64_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 752 bytes cmem[0], 8 bytes cmem[2] [49/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k65536.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k65536.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218314 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 182 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218314 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 181 registers, 784 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218314 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 784 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218365 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218365 bytes gmem ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218365 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218314 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 179 registers, 752 bytes cmem[0], 8 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218314 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 179 registers, 752 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218314 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z46fmha_cutlassB_f32_notaligned_64x64_k65536_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb0ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 179 registers, 752 bytes cmem[0], 8 bytes cmem[2] [50/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k64_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k64_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 784 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 752 bytes cmem[0], 40 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218380 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218380 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218380 bytes gmem ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 752 bytes cmem[0], 40 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 201 registers, 784 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218324 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z51fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi64ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 752 bytes cmem[0], 40 bytes cmem[2] [51/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassF_bf16_aligned.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassF_bf16_aligned.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218354 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassF_bf16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassF_bf16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassF_bf16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z41fmha_cutlassF_bf16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_bf16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassF_bf16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : 218354 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassF_bf16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassF_bf16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassF_bf16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z41fmha_cutlassF_bf16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_bf16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassF_bf16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218354 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassF_bf16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassF_bf16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassF_bf16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z41fmha_cutlassF_bf16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_bf16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassF_bf16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : 218354 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassF_bf16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassF_bf16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassF_bf16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z41fmha_cutlassF_bf16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_bf16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassF_bf16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : 218354 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassF_bf16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassF_bf16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassF_bf16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z41fmha_cutlassF_bf16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_bf16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassF_bf16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : 218354 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassF_bf16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassF_bf16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z41fmha_cutlassF_bf16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z41fmha_cutlassF_bf16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_bf16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassF_bf16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : 218062 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassF_bf16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassF_bf16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 600 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z41fmha_cutlassF_bf16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z41fmha_cutlassF_bf16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 128 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassF_bf16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassF_bf16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 600 bytes cmem[0], 8 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218062 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z43fmha_cutlassF_bf16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassF_bf16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 600 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z41fmha_cutlassF_bf16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z41fmha_cutlassF_bf16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassF_bf16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassF_bf16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 122 registers, 600 bytes cmem[0], 8 bytes cmem[2] ptxas info : 218062 bytes gmem ptxas info : Compiling entry function '_Z43fmha_cutlassF_bf16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassF_bf16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 125 registers ptxas info : Compiling entry function '_Z41fmha_cutlassF_bf16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z41fmha_cutlassF_bf16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 124 registers ptxas info : Compiling entry function '_Z40fmha_cutlassF_bf16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassF_bf16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass10bfloat16_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 122 registers [52/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k65536_dropout.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k65536_dropout.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218330 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218330 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 40 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218330 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218389 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218389 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218389 bytes gmem ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218330 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 784 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218330 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 784 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 784 bytes cmem[0] ptxas info : 218330 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75N23AttentionBackwardKernelIN7cutlass4arch4Sm75EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70N23AttentionBackwardKernelIN7cutlass4arch4Sm70EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 752 bytes cmem[0] ptxas info : Compiling entry function '_Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z54fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50N23AttentionBackwardKernelIN7cutlass4arch4Sm50EfLb0ELb1ELb0ELi64ELi64ELi65536ELb0EE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 752 bytes cmem[0], 40 bytes cmem[2] [53/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassF_f16_notaligned.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassF_f16_notaligned.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218676 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 155 registers, 600 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 160 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers, 600 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : 218676 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers, 600 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 112 bytes stack frame, 128 bytes spill stores, 120 bytes spill loads ptxas info : Used 168 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 56 bytes stack frame, 68 bytes spill stores, 60 bytes spill loads ptxas info : Used 168 registers, 600 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218971 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : 218971 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : 218676 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 153 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218971 bytes gmem ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218676 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 140 registers, 568 bytes cmem[0], 52 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 164 registers, 568 bytes cmem[0], 52 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 158 registers, 568 bytes cmem[0], 52 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218676 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 144 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 161 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218676 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z45fmha_cutlassF_f16_notaligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 140 registers, 568 bytes cmem[0], 52 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassF_f16_notaligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 164 registers, 568 bytes cmem[0], 52 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassF_f16_notaligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 158 registers, 568 bytes cmem[0], 52 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads [54/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassF_f16_aligned.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassF_f16_aligned.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218959 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 156 registers, 600 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers, 600 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : 218956 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 600 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 128 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 600 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : 218959 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 144 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 161 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218956 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 600 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 122 registers, 600 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : 218959 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 140 registers, 568 bytes cmem[0], 52 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 164 registers, 568 bytes cmem[0], 52 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 158 registers, 568 bytes cmem[0], 52 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218956 bytes gmem ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 125 registers ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 122 registers ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 122 registers ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : 218959 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 600 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 72 bytes stack frame, 92 bytes spill stores, 84 bytes spill loads ptxas info : Used 168 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 32 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads ptxas info : Used 168 registers, 600 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : 218959 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 140 registers, 568 bytes cmem[0], 52 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 164 registers, 568 bytes cmem[0], 52 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 158 registers, 568 bytes cmem[0], 52 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218959 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassF_f16_aligned_32x128_gmem_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 153 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_64x128_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f16_aligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassF_f16_aligned_32x128_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm80N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm75N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm70N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f16_aligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z39fmha_cutlassF_f16_aligned_64x64_rf_sm50N15AttentionKernelIN7cutlass6half_tENS0_4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 568 bytes cmem[0], 40 bytes cmem[2] [55/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassF_f32_notaligned.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassF_f32_notaligned.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218676 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 148 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218676 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 143 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218676 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 148 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218971 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : 218971 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : 218676 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 600 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : 218971 bytes gmem ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218676 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 148 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 568 bytes cmem[0], 40 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218676 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z45fmha_cutlassF_f32_notaligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 164 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z43fmha_cutlassF_f32_notaligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 600 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassF_f32_notaligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb0ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] [56/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassF_f32_aligned.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassF_f32_aligned.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218959 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_64x128_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_64x128_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 164 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 600 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_70' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : 218959 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_64x128_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_64x128_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 600 bytes cmem[0], 24 bytes cmem[2] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_75' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : 218959 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 148 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_64x128_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_64x128_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_52' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218959 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 143 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_64x128_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_64x128_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_60' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218959 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 148 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_64x128_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_64x128_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_61' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218959 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 148 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_64x128_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_64x128_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 568 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_50' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 568 bytes cmem[0], 40 bytes cmem[2] ptxas info : 218956 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 112 bytes stack frame, 144 bytes spill stores, 184 bytes spill loads ptxas info : Used 168 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_64x128_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_64x128_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 208 bytes stack frame, 380 bytes spill stores, 372 bytes spill loads ptxas info : Used 168 registers, 600 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_86' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : 218956 bytes gmem, 112 bytes cmem[3] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 168 bytes stack frame, 192 bytes spill stores, 252 bytes spill loads ptxas info : Used 168 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_64x128_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_64x128_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 600 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 264 bytes stack frame, 436 bytes spill stores, 436 bytes spill loads ptxas info : Used 168 registers, 600 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_80' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 600 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 218956 bytes gmem ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 112 bytes stack frame, 128 bytes spill stores, 172 bytes spill loads ptxas info : Used 168 registers ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z42fmha_cutlassF_f32_aligned_32x128_gmem_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi65536ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_64x128_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_64x128_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z40fmha_cutlassF_f32_aligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z40fmha_cutlassF_f32_aligned_32x128_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi32ELi128ELi128ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm80N15AttentionKernelIfN7cutlass4arch4Sm80ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 232 bytes stack frame, 404 bytes spill stores, 400 bytes spill loads ptxas info : Used 168 registers ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm75N15AttentionKernelIfN7cutlass4arch4Sm75ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm70N15AttentionKernelIfN7cutlass4arch4Sm70ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers ptxas info : Compiling entry function '_Z39fmha_cutlassF_f32_aligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE' for 'sm_90' ptxas info : Function properties for _Z39fmha_cutlassF_f32_aligned_64x64_rf_sm50N15AttentionKernelIfN7cutlass4arch4Sm50ELb1ELi64ELi64ELi64ELb1ELb1E18DefaultToBatchHookE6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers [57/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/decoder.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/decoder.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 33 bytes gmem ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIfLi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS1_17RestrictPtrTraitsEiEENS2_IS3_Lm5ES4_lEES6_S5_NS2_IiLm1ES4_iEEf' for 'sm_50' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIfLi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS1_17RestrictPtrTraitsEiEENS2_IS3_Lm5ES4_lEES6_S5_NS2_IiLm1ES4_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 612 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c108BFloat16ELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf' for 'sm_50' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c108BFloat16ELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 612 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c104HalfELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf' for 'sm_50' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c104HalfELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 612 bytes cmem[0] ptxas info : 33 bytes gmem ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIfLi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS1_17RestrictPtrTraitsEiEENS2_IS3_Lm5ES4_lEES6_S5_NS2_IiLm1ES4_iEEf' for 'sm_52' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIfLi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS1_17RestrictPtrTraitsEiEENS2_IS3_Lm5ES4_lEES6_S5_NS2_IiLm1ES4_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 612 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c108BFloat16ELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf' for 'sm_52' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c108BFloat16ELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 612 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c104HalfELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf' for 'sm_52' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c104HalfELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 612 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 33 bytes gmem ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIfLi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS1_17RestrictPtrTraitsEiEENS2_IS3_Lm5ES4_lEES6_S5_NS2_IiLm1ES4_iEEf' for 'sm_60' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIfLi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS1_17RestrictPtrTraitsEiEENS2_IS3_Lm5ES4_lEES6_S5_NS2_IiLm1ES4_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 612 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c108BFloat16ELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf' for 'sm_60' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c108BFloat16ELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 612 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c104HalfELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf' for 'sm_60' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c104HalfELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 612 bytes cmem[0] ptxas info : 17 bytes gmem ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIfLi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS1_17RestrictPtrTraitsEiEENS2_IS3_Lm5ES4_lEES6_S5_NS2_IiLm1ES4_iEEf' for 'sm_80' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIfLi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS1_17RestrictPtrTraitsEiEENS2_IS3_Lm5ES4_lEES6_S5_NS2_IiLm1ES4_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 644 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c108BFloat16ELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf' for 'sm_80' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c108BFloat16ELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 43 registers, 644 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c104HalfELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf' for 'sm_80' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c104HalfELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 43 registers, 644 bytes cmem[0] ptxas info : 17 bytes gmem ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIfLi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS1_17RestrictPtrTraitsEiEENS2_IS3_Lm5ES4_lEES6_S5_NS2_IiLm1ES4_iEEf' for 'sm_75' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIfLi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS1_17RestrictPtrTraitsEiEENS2_IS3_Lm5ES4_lEES6_S5_NS2_IiLm1ES4_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 58 registers, 644 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c108BFloat16ELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf' for 'sm_75' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c108BFloat16ELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 57 registers, 644 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c104HalfELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf' for 'sm_75' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c104HalfELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 58 registers, 644 bytes cmem[0] ptxas info : 33 bytes gmem ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIfLi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS1_17RestrictPtrTraitsEiEENS2_IS3_Lm5ES4_lEES6_S5_NS2_IiLm1ES4_iEEf' for 'sm_61' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIfLi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS1_17RestrictPtrTraitsEiEENS2_IS3_Lm5ES4_lEES6_S5_NS2_IiLm1ES4_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 612 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c108BFloat16ELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf' for 'sm_61' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c108BFloat16ELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 612 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c104HalfELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf' for 'sm_61' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c104HalfELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 8 registers, 612 bytes cmem[0] ptxas info : 17 bytes gmem ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIfLi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS1_17RestrictPtrTraitsEiEENS2_IS3_Lm5ES4_lEES6_S5_NS2_IiLm1ES4_iEEf' for 'sm_90' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIfLi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS1_17RestrictPtrTraitsEiEENS2_IS3_Lm5ES4_lEES6_S5_NS2_IiLm1ES4_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c108BFloat16ELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf' for 'sm_90' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c108BFloat16ELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c104HalfELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf' for 'sm_90' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c104HalfELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : 17 bytes gmem ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIfLi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS1_17RestrictPtrTraitsEiEENS2_IS3_Lm5ES4_lEES6_S5_NS2_IiLm1ES4_iEEf' for 'sm_86' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIfLi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS1_17RestrictPtrTraitsEiEENS2_IS3_Lm5ES4_lEES6_S5_NS2_IiLm1ES4_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 644 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c108BFloat16ELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf' for 'sm_86' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c108BFloat16ELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 644 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c104HalfELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf' for 'sm_86' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c104HalfELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 644 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 17 bytes gmem ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIfLi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS1_17RestrictPtrTraitsEiEENS2_IS3_Lm5ES4_lEES6_S5_NS2_IiLm1ES4_iEEf' for 'sm_70' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIfLi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS1_17RestrictPtrTraitsEiEENS2_IS3_Lm5ES4_lEES6_S5_NS2_IiLm1ES4_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 50 registers, 644 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c108BFloat16ELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf' for 'sm_70' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c108BFloat16ELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 49 registers, 644 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c104HalfELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf' for 'sm_70' ptxas info : Function properties for _ZN51_GLOBAL__N__012075ed_10_decoder_cu_ac8d307b_135543815mqa_attn_kernelIN3c104HalfELi0EEEvN2at27GenericPackedTensorAccessorIT_Lm5ENS3_17RestrictPtrTraitsEiEENS4_IS5_Lm5ES6_lEES8_S7_NS4_IiLm1ES6_iEEf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 49 registers, 644 bytes cmem[0] [58/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/matmul.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/matmul.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIN3c104HalfEEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS3_16DefaultPtrTraitsElEENS4_IS5_Lm3ES6_lEES8_NS4_IlLm2ES6_lEE' for 'sm_70' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIN3c104HalfEEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS3_16DefaultPtrTraitsElEENS4_IS5_Lm3ES6_lEES8_NS4_IlLm2ES6_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 528 bytes cmem[0] ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIfEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE' for 'sm_70' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIfEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 528 bytes cmem[0] ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIdEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE' for 'sm_70' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIdEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 528 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIN3c104HalfEEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS3_16DefaultPtrTraitsElEENS4_IS5_Lm3ES6_lEES8_NS4_IlLm2ES6_lEE' for 'sm_50' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIN3c104HalfEEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS3_16DefaultPtrTraitsElEENS4_IS5_Lm3ES6_lEES8_NS4_IlLm2ES6_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 496 bytes cmem[0] ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIfEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE' for 'sm_50' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIfEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 496 bytes cmem[0] ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIdEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE' for 'sm_50' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIdEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 496 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIN3c104HalfEEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS3_16DefaultPtrTraitsElEENS4_IS5_Lm3ES6_lEES8_NS4_IlLm2ES6_lEE' for 'sm_61' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIN3c104HalfEEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS3_16DefaultPtrTraitsElEENS4_IS5_Lm3ES6_lEES8_NS4_IlLm2ES6_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 496 bytes cmem[0] ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIfEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE' for 'sm_61' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIfEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 496 bytes cmem[0] ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIdEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE' for 'sm_61' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIdEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 496 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIN3c104HalfEEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS3_16DefaultPtrTraitsElEENS4_IS5_Lm3ES6_lEES8_NS4_IlLm2ES6_lEE' for 'sm_60' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIN3c104HalfEEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS3_16DefaultPtrTraitsElEENS4_IS5_Lm3ES6_lEES8_NS4_IlLm2ES6_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 496 bytes cmem[0] ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIfEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE' for 'sm_60' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIfEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 496 bytes cmem[0] ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIdEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE' for 'sm_60' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIdEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 496 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIN3c104HalfEEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS3_16DefaultPtrTraitsElEENS4_IS5_Lm3ES6_lEES8_NS4_IlLm2ES6_lEE' for 'sm_86' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIN3c104HalfEEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS3_16DefaultPtrTraitsElEENS4_IS5_Lm3ES6_lEES8_NS4_IlLm2ES6_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 528 bytes cmem[0] ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIfEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE' for 'sm_86' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIfEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 528 bytes cmem[0] ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIdEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE' for 'sm_86' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIdEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 528 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIN3c104HalfEEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS3_16DefaultPtrTraitsElEENS4_IS5_Lm3ES6_lEES8_NS4_IlLm2ES6_lEE' for 'sm_75' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIN3c104HalfEEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS3_16DefaultPtrTraitsElEENS4_IS5_Lm3ES6_lEES8_NS4_IlLm2ES6_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 528 bytes cmem[0] ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIfEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE' for 'sm_75' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIfEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 528 bytes cmem[0] ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIdEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE' for 'sm_75' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIdEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 528 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIN3c104HalfEEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS3_16DefaultPtrTraitsElEENS4_IS5_Lm3ES6_lEES8_NS4_IlLm2ES6_lEE' for 'sm_90' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIN3c104HalfEEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS3_16DefaultPtrTraitsElEENS4_IS5_Lm3ES6_lEES8_NS4_IlLm2ES6_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIfEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE' for 'sm_90' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIfEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIdEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE' for 'sm_90' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIdEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIN3c104HalfEEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS3_16DefaultPtrTraitsElEENS4_IS5_Lm3ES6_lEES8_NS4_IlLm2ES6_lEE' for 'sm_80' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIN3c104HalfEEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS3_16DefaultPtrTraitsElEENS4_IS5_Lm3ES6_lEES8_NS4_IlLm2ES6_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 528 bytes cmem[0] ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIfEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE' for 'sm_80' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIfEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 528 bytes cmem[0] ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIdEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE' for 'sm_80' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIdEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 528 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIN3c104HalfEEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS3_16DefaultPtrTraitsElEENS4_IS5_Lm3ES6_lEES8_NS4_IlLm2ES6_lEE' for 'sm_52' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIN3c104HalfEEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS3_16DefaultPtrTraitsElEENS4_IS5_Lm3ES6_lEES8_NS4_IlLm2ES6_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 496 bytes cmem[0] ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIfEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE' for 'sm_52' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIfEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 496 bytes cmem[0] ptxas info : Compiling entry function '_ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIdEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE' for 'sm_52' ptxas info : Function properties for _ZN49_GLOBAL__N__d0da1e2e_9_matmul_cu_ac8d307b_135556830matmul_with_sparse_mask_kernelIdEEvN2at27GenericPackedTensorAccessorIT_Lm1ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm3ES4_lEES6_NS2_IlLm2ES4_lEE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 496 bytes cmem[0] /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/matmul.cu: In lambda function: /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/matmul.cu:92:1122: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = double; long unsigned int N = 1; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 92 | AT_DISPATCH_FLOATING_TYPES_AND_HALF( | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/matmul.cu:92:1160: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = double; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 92 | AT_DISPATCH_FLOATING_TYPES_AND_HALF( | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/matmul.cu:92:1199: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = double; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 92 | AT_DISPATCH_FLOATING_TYPES_AND_HALF( | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/matmul.cu:92:1245: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = long int; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 92 | AT_DISPATCH_FLOATING_TYPES_AND_HALF( | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/matmul.cu: In lambda function: /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/matmul.cu:92:2177: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 1; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 92 | AT_DISPATCH_FLOATING_TYPES_AND_HALF( | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/matmul.cu:92:2214: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 92 | AT_DISPATCH_FLOATING_TYPES_AND_HALF( | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/matmul.cu:92:2252: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 92 | AT_DISPATCH_FLOATING_TYPES_AND_HALF( | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/matmul.cu:92:2298: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = long int; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 92 | AT_DISPATCH_FLOATING_TYPES_AND_HALF( | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/matmul.cu: In lambda function: /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/matmul.cu:92:3223: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = c10::Half; long unsigned int N = 1; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 92 | AT_DISPATCH_FLOATING_TYPES_AND_HALF( | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/matmul.cu:92:3264: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = c10::Half; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 92 | AT_DISPATCH_FLOATING_TYPES_AND_HALF( | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/matmul.cu:92:3306: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = c10::Half; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 92 | AT_DISPATCH_FLOATING_TYPES_AND_HALF( | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/matmul.cu:92:3352: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = long int; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 92 | AT_DISPATCH_FLOATING_TYPES_AND_HALF( | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ [59/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/sddmm.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/sddmm.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2IfLi1ELi32ELi32ELi32ELi1EEEviiiPKiS3_S3_PKfS5_Pfi' for 'sm_52' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2IfLi1ELi32ELi32ELi32ELi1EEEviiiPKiS3_S3_PKfS5_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 128 bytes smem, 388 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float2Li2ELi32ELi32ELi16ELi1EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_52' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float2Li2ELi32ELi32ELi16ELi1EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 256 bytes smem, 388 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi1EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_52' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi1EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 78 registers, 512 bytes smem, 388 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi0EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_52' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi0EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 512 bytes smem, 388 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2IfLi1ELi32ELi32ELi32ELi1EEEviiiPKiS3_S3_PKfS5_Pfi' for 'sm_70' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2IfLi1ELi32ELi32ELi32ELi1EEEviiiPKiS3_S3_PKfS5_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 128 bytes smem, 420 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float2Li2ELi32ELi32ELi16ELi1EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_70' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float2Li2ELi32ELi32ELi16ELi1EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 256 bytes smem, 420 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi1EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_70' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi1EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 512 bytes smem, 420 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi0EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_70' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi0EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 512 bytes smem, 420 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2IfLi1ELi32ELi32ELi32ELi1EEEviiiPKiS3_S3_PKfS5_Pfi' for 'sm_75' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2IfLi1ELi32ELi32ELi32ELi1EEEviiiPKiS3_S3_PKfS5_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 128 bytes smem, 420 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float2Li2ELi32ELi32ELi16ELi1EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_75' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float2Li2ELi32ELi32ELi16ELi1EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 114 registers, 256 bytes smem, 420 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi1EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_75' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi1EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 138 registers, 512 bytes smem, 420 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi0EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_75' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi0EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 512 bytes smem, 420 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2IfLi1ELi32ELi32ELi32ELi1EEEviiiPKiS3_S3_PKfS5_Pfi' for 'sm_50' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2IfLi1ELi32ELi32ELi32ELi1EEEviiiPKiS3_S3_PKfS5_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 128 bytes smem, 388 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float2Li2ELi32ELi32ELi16ELi1EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_50' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float2Li2ELi32ELi32ELi16ELi1EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 256 bytes smem, 388 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi1EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_50' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi1EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 78 registers, 512 bytes smem, 388 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi0EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_50' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi0EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 512 bytes smem, 388 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2IfLi1ELi32ELi32ELi32ELi1EEEviiiPKiS3_S3_PKfS5_Pfi' for 'sm_61' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2IfLi1ELi32ELi32ELi32ELi1EEEviiiPKiS3_S3_PKfS5_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 128 bytes smem, 388 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float2Li2ELi32ELi32ELi16ELi1EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_61' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float2Li2ELi32ELi32ELi16ELi1EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 256 bytes smem, 388 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi1EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_61' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi1EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 78 registers, 512 bytes smem, 388 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi0EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_61' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi0EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 512 bytes smem, 388 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2IfLi1ELi32ELi32ELi32ELi1EEEviiiPKiS3_S3_PKfS5_Pfi' for 'sm_86' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2IfLi1ELi32ELi32ELi32ELi1EEEviiiPKiS3_S3_PKfS5_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers, 128 bytes smem, 420 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float2Li2ELi32ELi32ELi16ELi1EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_86' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float2Li2ELi32ELi32ELi16ELi1EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 115 registers, 256 bytes smem, 420 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi1EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_86' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi1EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 132 registers, 512 bytes smem, 420 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi0EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_86' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi0EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 130 registers, 512 bytes smem, 420 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2IfLi1ELi32ELi32ELi32ELi1EEEviiiPKiS3_S3_PKfS5_Pfi' for 'sm_60' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2IfLi1ELi32ELi32ELi32ELi1EEEviiiPKiS3_S3_PKfS5_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 128 bytes smem, 388 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float2Li2ELi32ELi32ELi16ELi1EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_60' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float2Li2ELi32ELi32ELi16ELi1EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 256 bytes smem, 388 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi1EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_60' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi1EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 78 registers, 512 bytes smem, 388 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi0EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_60' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi0EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 512 bytes smem, 388 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2IfLi1ELi32ELi32ELi32ELi1EEEviiiPKiS3_S3_PKfS5_Pfi' for 'sm_90' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2IfLi1ELi32ELi32ELi32ELi1EEEviiiPKiS3_S3_PKfS5_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 128 bytes smem ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float2Li2ELi32ELi32ELi16ELi1EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_90' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float2Li2ELi32ELi32ELi16ELi1EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 256 bytes smem ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi1EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_90' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi1EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 78 registers, 512 bytes smem ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi0EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_90' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi0EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers, 512 bytes smem /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2IfLi1ELi32ELi32ELi32ELi1EEEviiiPKiS3_S3_PKfS5_Pfi' for 'sm_80' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2IfLi1ELi32ELi32ELi32ELi1EEEviiiPKiS3_S3_PKfS5_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 66 registers, 128 bytes smem, 420 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float2Li2ELi32ELi32ELi16ELi1EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_80' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float2Li2ELi32ELi32ELi16ELi1EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 78 registers, 256 bytes smem, 420 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi1EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_80' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi1EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers, 512 bytes smem, 420 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi0EEEviiiPKiS4_S4_PKfS6_Pfi' for 'sm_80' ptxas info : Function properties for _ZN7sputnik40_GLOBAL__N__da0d0554_8_sddmm_cu_76d5abc216CudaSddmmKernel2I6float4Li4ELi32ELi32ELi8ELi0EEEviiiPKiS4_S4_PKfS6_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 512 bytes smem, 420 bytes cmem[0] [60/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/sddmm2_cuda.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/sddmm2_cuda.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCSR1ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_61' ptxas info : Function properties for _ZN7ge_spmm14sddmmCSR1ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 48 registers, 384 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCSR2ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_61' ptxas info : Function properties for _ZN7ge_spmm14sddmmCSR2ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 384 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO1ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_61' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO1ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 48 registers, 384 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO2ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_61' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO2ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 384 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO4ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_61' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO4ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 384 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCSR1ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_50' ptxas info : Function properties for _ZN7ge_spmm14sddmmCSR1ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 48 registers, 384 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCSR2ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_50' ptxas info : Function properties for _ZN7ge_spmm14sddmmCSR2ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 384 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO1ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_50' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO1ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 48 registers, 384 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO2ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_50' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO2ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 384 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO4ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_50' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO4ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 384 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCSR1ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_60' ptxas info : Function properties for _ZN7ge_spmm14sddmmCSR1ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 48 registers, 384 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCSR2ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_60' ptxas info : Function properties for _ZN7ge_spmm14sddmmCSR2ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 384 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO1ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_60' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO1ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 48 registers, 384 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO2ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_60' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO2ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 384 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO4ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_60' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO4ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 384 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCSR1ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_70' ptxas info : Function properties for _ZN7ge_spmm14sddmmCSR1ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 48 registers, 416 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCSR2ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_70' ptxas info : Function properties for _ZN7ge_spmm14sddmmCSR2ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 48 registers, 416 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO1ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_70' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO1ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 48 registers, 416 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO2ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_70' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO2ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 48 registers, 416 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO4ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_70' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO4ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 66 registers, 416 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCSR1ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_80' ptxas info : Function properties for _ZN7ge_spmm14sddmmCSR1ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 416 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCSR2ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_80' ptxas info : Function properties for _ZN7ge_spmm14sddmmCSR2ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 416 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO1ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_80' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO1ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 416 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO2ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_80' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO2ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 416 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO4ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_80' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO4ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 416 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCSR1ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_86' ptxas info : Function properties for _ZN7ge_spmm14sddmmCSR1ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 416 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCSR2ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_86' ptxas info : Function properties for _ZN7ge_spmm14sddmmCSR2ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 416 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO1ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_86' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO1ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 416 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO2ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_86' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO2ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 416 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO4ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_86' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO4ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 49 registers, 416 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCSR1ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_75' ptxas info : Function properties for _ZN7ge_spmm14sddmmCSR1ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 61 registers, 416 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCSR2ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_75' ptxas info : Function properties for _ZN7ge_spmm14sddmmCSR2ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 416 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO1ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_75' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO1ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 61 registers, 416 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO2ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_75' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO2ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 416 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO4ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_75' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO4ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers, 416 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCSR1ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_90' ptxas info : Function properties for _ZN7ge_spmm14sddmmCSR1ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCSR2ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_90' ptxas info : Function properties for _ZN7ge_spmm14sddmmCSR2ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO1ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_90' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO1ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO2ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_90' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO2ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO4ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_90' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO4ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCSR1ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_52' ptxas info : Function properties for _ZN7ge_spmm14sddmmCSR1ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 48 registers, 384 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCSR2ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_52' ptxas info : Function properties for _ZN7ge_spmm14sddmmCSR2ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 384 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO1ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_52' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO1ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 48 registers, 384 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO2ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_52' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO2ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 384 bytes cmem[0] ptxas info : Compiling entry function '_ZN7ge_spmm14sddmmCOO4ScaleIfEEviiimPiS1_PT_S3_S3_' for 'sm_52' ptxas info : Function properties for _ZN7ge_spmm14sddmmCOO4ScaleIfEEviiimPiS1_PT_S3_S3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 384 bytes cmem[0] [61/82] g++ -MMD -MF /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/matmul.o.d -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/matmul.cpp -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/matmul.o -O3 -std=c++17 -fopenmp -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 [62/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/small_k.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 218049 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE' for 'sm_75' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE' for 'sm_75' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 165 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_75' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_75' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 178 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_75' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_75' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE' for 'sm_75' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 704 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_75' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 704 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_75' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 256 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 136 registers, 704 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE' for 'sm_75' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 704 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_75' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 704 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_75' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 256 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 134 registers, 704 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546314dropout_kernelIffLi32ELi2ELi4EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES3_NS1_15PhiloxCudaStateE' for 'sm_75' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546314dropout_kernelIffLi32ELi2ELi4EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 440 bytes cmem[0] ptxas info : 218049 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE' for 'sm_80' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE' for 'sm_80' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 157 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_80' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_80' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_80' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_80' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE' for 'sm_80' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 704 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_80' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 704 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_80' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 256 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 136 registers, 704 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE' for 'sm_80' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 126 registers, 704 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_80' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 704 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_80' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 256 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 134 registers, 704 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546314dropout_kernelIffLi32ELi2ELi4EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES3_NS1_15PhiloxCudaStateE' for 'sm_80' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546314dropout_kernelIffLi32ELi2ELi4EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 440 bytes cmem[0] ptxas info : 218049 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE' for 'sm_50' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE' for 'sm_50' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_50' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_50' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 178 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_50' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_50' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE' for 'sm_50' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 672 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_50' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 672 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_50' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 256 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 672 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE' for 'sm_50' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 161 registers, 672 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_50' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 672 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_50' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 256 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers, 672 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546314dropout_kernelIffLi32ELi2ELi4EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES3_NS1_15PhiloxCudaStateE' for 'sm_50' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546314dropout_kernelIffLi32ELi2ELi4EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 119 registers, 408 bytes cmem[0] ptxas info : 218049 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE' for 'sm_70' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE' for 'sm_70' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_70' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_70' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_70' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 64 bytes stack frame, 128 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_70' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE' for 'sm_70' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 704 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_70' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 704 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_70' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 256 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 170 registers, 704 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE' for 'sm_70' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 153 registers, 704 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_70' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 165 registers, 704 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_70' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 256 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 704 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546314dropout_kernelIffLi32ELi2ELi4EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES3_NS1_15PhiloxCudaStateE' for 'sm_70' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546314dropout_kernelIffLi32ELi2ELi4EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 120 registers, 440 bytes cmem[0] ptxas info : 218049 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE' for 'sm_86' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE' for 'sm_86' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_86' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_86' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_86' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_86' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 4224 bytes smem, 928 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE' for 'sm_86' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 704 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_86' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 129 registers, 704 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_86' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 256 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 704 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE' for 'sm_86' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 127 registers, 704 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_86' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 704 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_86' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 256 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 136 registers, 704 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546314dropout_kernelIffLi32ELi2ELi4EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES3_NS1_15PhiloxCudaStateE' for 'sm_86' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546314dropout_kernelIffLi32ELi2ELi4EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 440 bytes cmem[0] ptxas info : 218049 bytes gmem ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE' for 'sm_90' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE 216 bytes stack frame, 216 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE' for 'sm_90' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 4224 bytes smem ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_90' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_90' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers, 4224 bytes smem ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_90' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_90' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 4224 bytes smem ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE' for 'sm_90' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_90' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 135 registers ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_90' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 256 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 136 registers ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE' for 'sm_90' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_90' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_90' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 256 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546314dropout_kernelIffLi32ELi2ELi4EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES3_NS1_15PhiloxCudaStateE' for 'sm_90' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546314dropout_kernelIffLi32ELi2ELi4EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 104 registers /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 218049 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE' for 'sm_61' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE' for 'sm_61' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_61' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_61' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 178 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_61' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_61' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE' for 'sm_61' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 672 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_61' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 672 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_61' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 256 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 672 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE' for 'sm_61' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 161 registers, 672 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_61' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 672 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_61' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 256 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers, 672 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546314dropout_kernelIffLi32ELi2ELi4EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES3_NS1_15PhiloxCudaStateE' for 'sm_61' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546314dropout_kernelIffLi32ELi2ELi4EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 119 registers, 408 bytes cmem[0] ptxas info : 218049 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE' for 'sm_52' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE' for 'sm_52' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_52' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_52' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 178 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_52' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_52' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE' for 'sm_52' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 672 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_52' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 672 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_52' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 256 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 672 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE' for 'sm_52' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 161 registers, 672 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_52' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 672 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_52' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 256 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers, 672 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546314dropout_kernelIffLi32ELi2ELi4EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES3_NS1_15PhiloxCudaStateE' for 'sm_52' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546314dropout_kernelIffLi32ELi2ELi4EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 119 registers, 408 bytes cmem[0] ptxas info : 218049 bytes gmem, 72 bytes cmem[3] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE' for 'sm_60' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE' for 'sm_60' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIffLi4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES5_S5_S5_S5_S5_S5_S5_NS2_IS3_Lm2ES4_lEES5_S3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_60' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_60' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float2Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 178 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_60' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE' for 'sm_60' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546325attention_backward_kernelIf6float4Li4ELi8ELi32ELi32ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEES6_S6_S6_S6_S6_S6_S6_NS3_IS4_Lm2ES5_lEES6_S4_NS2_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 4224 bytes smem, 896 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE' for 'sm_60' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 144 registers, 672 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_60' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 143 registers, 672 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_60' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb0EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 256 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 160 registers, 672 bytes cmem[0] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE' for 'sm_60' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIffLi32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEENS2_IS3_Lm2ES4_lEES5_S5_S5_S5_S3_NS1_15PhiloxCudaStateE 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 144 registers, 672 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_60' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float2Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 128 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 142 registers, 672 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE' for 'sm_60' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546316attention_kernelIf6float4Li32ELi2ELi4ELi8ELb1EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS2_16DefaultPtrTraitsElEENS3_IS4_Lm2ES5_lEES6_S6_S6_S6_S4_NS2_15PhiloxCudaStateE 256 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 144 registers, 672 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546314dropout_kernelIffLi32ELi2ELi4EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES3_NS1_15PhiloxCudaStateE' for 'sm_60' ptxas info : Function properties for _ZN51_GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_135546314dropout_kernelIffLi32ELi2ELi4EEEvN2at27GenericPackedTensorAccessorIT_Lm3ENS1_16DefaultPtrTraitsElEES3_NS1_15PhiloxCudaStateE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 119 registers, 408 bytes cmem[0] /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu: In function ‘at::Tensor _GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_1355463::_dropout_mask(at::Tensor, double)’: /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1456:144: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1456 | dropout_kernel | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu: In instantiation of ‘void _GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_1355463::launch_attention(at::Tensor&, at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, float, at::PhiloxCudaState) [with bool compute_logsumexp = true]’: /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:805:25: required from here /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:668:212: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 668 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:668:264: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 668 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:668:312: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 668 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:668:358: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 668 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:668:406: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 668 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:688:212: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 688 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:688:264: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 688 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:688:312: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 688 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:688:358: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 688 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:688:406: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 688 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:709:211: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 709 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:709:263: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 709 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:709:311: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 709 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:709:357: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 709 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:709:405: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 709 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu: In instantiation of ‘void _GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_1355463::launch_attention(at::Tensor&, at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, float, at::PhiloxCudaState) [with bool compute_logsumexp = false]’: /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:808:26: required from here /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:668:212: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 668 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:668:264: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 668 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:668:312: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 668 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:668:358: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 668 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:668:406: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 668 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:688:212: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 688 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:688:264: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 688 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:688:312: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 688 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:688:358: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 688 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:688:406: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 688 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:709:211: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 709 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:709:263: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 709 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:709:311: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 709 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:709:357: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 709 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:709:405: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 709 | attention_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu: In instantiation of ‘void _GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_1355463::launch_attention_backward(at::Tensor&, at::Tensor&, at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, float, at::PhiloxCudaState) [with scalar_t = float; vec_t = float4]’: /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1247:43: required from here /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:215: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:265: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:315: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:367: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:416: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:463: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:512: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:562: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:615: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:214: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:264: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:314: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:366: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:415: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:462: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:511: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:561: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:614: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu: In instantiation of ‘void _GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_1355463::launch_attention_backward(at::Tensor&, at::Tensor&, at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, float, at::PhiloxCudaState) [with scalar_t = float; vec_t = float2]’: /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1261:43: required from here /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:215: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:265: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:315: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:367: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:416: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:463: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:512: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:562: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:615: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:214: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:264: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:314: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:366: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:415: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:462: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:511: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:561: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:614: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu: In instantiation of ‘void _GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_1355463::launch_attention_backward(at::Tensor&, at::Tensor&, at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, float, at::PhiloxCudaState) [with scalar_t = float; vec_t = float]’: /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1275:42: required from here /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:215: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:265: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:315: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:367: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:416: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:463: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:512: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:562: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1115:615: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1115 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:214: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:264: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:314: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:366: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:415: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:462: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:511: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:561: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:1136:614: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 1136 | attention_backward_kernel< | ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu: In instantiation of ‘at::PackedTensorAccessor _GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_1355463::_packed_tensor_accessor_or_dummy(const at::Tensor&) [with scalar_t = float; at::PackedTensorAccessor = at::GenericPackedTensorAccessor]’: /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:662:66: required from ‘void _GLOBAL__N__26946adb_10_small_k_cu_ac8d307b_1355463::launch_attention(at::Tensor&, at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, float, at::PhiloxCudaState) [with bool compute_logsumexp = true]’ /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:805:25: required from here /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/fmha/small_k.cu:619:57: warning: ‘at::GenericPackedTensorAccessor at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 3; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations] 619 | return attn_bias.packed_accessor(); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ^ /usr/lib64/python3.10/site-packages/torch/include/ATen/core/TensorBody.h:253:1: note: declared here 253 | GenericPackedTensorAccessor packed_accessor() const & { | ^ ~~~~~~~~~~~~~ [63/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/sparse_softmax.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/sparse_softmax.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik50_GLOBAL__N__2d4a0f7f_17_sparse_softmax_cu_4ff6710c19SparseSoftmaxKernelEiiPKfPKiS4_S4_Pfi' for 'sm_70' ptxas info : Function properties for _ZN7sputnik50_GLOBAL__N__2d4a0f7f_17_sparse_softmax_cu_4ff6710c19SparseSoftmaxKernelEiiPKfPKiS4_S4_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 18 registers, 404 bytes cmem[0] ptxas info : Compiling entry function '_Z27SparseSoftmaxBackwardKerneliiPKfS0_PKiS2_S2_Pfi' for 'sm_70' ptxas info : Function properties for _Z27SparseSoftmaxBackwardKerneliiPKfS0_PKiS2_S2_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 18 registers, 412 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik50_GLOBAL__N__2d4a0f7f_17_sparse_softmax_cu_4ff6710c19SparseSoftmaxKernelEiiPKfPKiS4_S4_Pfi' for 'sm_75' ptxas info : Function properties for _ZN7sputnik50_GLOBAL__N__2d4a0f7f_17_sparse_softmax_cu_4ff6710c19SparseSoftmaxKernelEiiPKfPKiS4_S4_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 16 registers, 404 bytes cmem[0] ptxas info : Compiling entry function '_Z27SparseSoftmaxBackwardKerneliiPKfS0_PKiS2_S2_Pfi' for 'sm_75' ptxas info : Function properties for _Z27SparseSoftmaxBackwardKerneliiPKfS0_PKiS2_S2_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 14 registers, 412 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik50_GLOBAL__N__2d4a0f7f_17_sparse_softmax_cu_4ff6710c19SparseSoftmaxKernelEiiPKfPKiS4_S4_Pfi' for 'sm_60' ptxas info : Function properties for _ZN7sputnik50_GLOBAL__N__2d4a0f7f_17_sparse_softmax_cu_4ff6710c19SparseSoftmaxKernelEiiPKfPKiS4_S4_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 14 registers, 372 bytes cmem[0] ptxas info : Compiling entry function '_Z27SparseSoftmaxBackwardKerneliiPKfS0_PKiS2_S2_Pfi' for 'sm_60' ptxas info : Function properties for _Z27SparseSoftmaxBackwardKerneliiPKfS0_PKiS2_S2_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 13 registers, 380 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik50_GLOBAL__N__2d4a0f7f_17_sparse_softmax_cu_4ff6710c19SparseSoftmaxKernelEiiPKfPKiS4_S4_Pfi' for 'sm_50' ptxas info : Function properties for _ZN7sputnik50_GLOBAL__N__2d4a0f7f_17_sparse_softmax_cu_4ff6710c19SparseSoftmaxKernelEiiPKfPKiS4_S4_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 14 registers, 372 bytes cmem[0] ptxas info : Compiling entry function '_Z27SparseSoftmaxBackwardKerneliiPKfS0_PKiS2_S2_Pfi' for 'sm_50' ptxas info : Function properties for _Z27SparseSoftmaxBackwardKerneliiPKfS0_PKiS2_S2_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 13 registers, 380 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik50_GLOBAL__N__2d4a0f7f_17_sparse_softmax_cu_4ff6710c19SparseSoftmaxKernelEiiPKfPKiS4_S4_Pfi' for 'sm_80' ptxas info : Function properties for _ZN7sputnik50_GLOBAL__N__2d4a0f7f_17_sparse_softmax_cu_4ff6710c19SparseSoftmaxKernelEiiPKfPKiS4_S4_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 18 registers, 404 bytes cmem[0] ptxas info : Compiling entry function '_Z27SparseSoftmaxBackwardKerneliiPKfS0_PKiS2_S2_Pfi' for 'sm_80' ptxas info : Function properties for _Z27SparseSoftmaxBackwardKerneliiPKfS0_PKiS2_S2_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 18 registers, 412 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik50_GLOBAL__N__2d4a0f7f_17_sparse_softmax_cu_4ff6710c19SparseSoftmaxKernelEiiPKfPKiS4_S4_Pfi' for 'sm_90' ptxas info : Function properties for _ZN7sputnik50_GLOBAL__N__2d4a0f7f_17_sparse_softmax_cu_4ff6710c19SparseSoftmaxKernelEiiPKfPKiS4_S4_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 20 registers ptxas info : Compiling entry function '_Z27SparseSoftmaxBackwardKerneliiPKfS0_PKiS2_S2_Pfi' for 'sm_90' ptxas info : Function properties for _Z27SparseSoftmaxBackwardKerneliiPKfS0_PKiS2_S2_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 18 registers ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik50_GLOBAL__N__2d4a0f7f_17_sparse_softmax_cu_4ff6710c19SparseSoftmaxKernelEiiPKfPKiS4_S4_Pfi' for 'sm_52' ptxas info : Function properties for _ZN7sputnik50_GLOBAL__N__2d4a0f7f_17_sparse_softmax_cu_4ff6710c19SparseSoftmaxKernelEiiPKfPKiS4_S4_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 14 registers, 372 bytes cmem[0] ptxas info : Compiling entry function '_Z27SparseSoftmaxBackwardKerneliiPKfS0_PKiS2_S2_Pfi' for 'sm_52' ptxas info : Function properties for _Z27SparseSoftmaxBackwardKerneliiPKfS0_PKiS2_S2_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 13 registers, 380 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik50_GLOBAL__N__2d4a0f7f_17_sparse_softmax_cu_4ff6710c19SparseSoftmaxKernelEiiPKfPKiS4_S4_Pfi' for 'sm_86' ptxas info : Function properties for _ZN7sputnik50_GLOBAL__N__2d4a0f7f_17_sparse_softmax_cu_4ff6710c19SparseSoftmaxKernelEiiPKfPKiS4_S4_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 18 registers, 404 bytes cmem[0] ptxas info : Compiling entry function '_Z27SparseSoftmaxBackwardKerneliiPKfS0_PKiS2_S2_Pfi' for 'sm_86' ptxas info : Function properties for _Z27SparseSoftmaxBackwardKerneliiPKfS0_PKiS2_S2_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 17 registers, 412 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik50_GLOBAL__N__2d4a0f7f_17_sparse_softmax_cu_4ff6710c19SparseSoftmaxKernelEiiPKfPKiS4_S4_Pfi' for 'sm_61' ptxas info : Function properties for _ZN7sputnik50_GLOBAL__N__2d4a0f7f_17_sparse_softmax_cu_4ff6710c19SparseSoftmaxKernelEiiPKfPKiS4_S4_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 14 registers, 372 bytes cmem[0] ptxas info : Compiling entry function '_Z27SparseSoftmaxBackwardKerneliiPKfS0_PKiS2_S2_Pfi' for 'sm_61' ptxas info : Function properties for _Z27SparseSoftmaxBackwardKerneliiPKfS0_PKiS2_S2_Pfi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 13 registers, 380 bytes cmem[0] [64/82] g++ -MMD -MF /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/sddmm.o.d -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/sddmm.cpp -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/sddmm.o -O3 -std=c++17 -fopenmp -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 [65/82] g++ -MMD -MF /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/sparse_softmax.o.d -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/sparse_softmax.cpp -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/sparse_softmax.o -O3 -std=c++17 -fopenmp -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 [66/82] g++ -MMD -MF /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/spmm.o.d -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/spmm.cpp -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/spmm.o -O3 -std=c++17 -fopenmp -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 [67/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/attention/cuda/spmm.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/spmm.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 512 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 108 registers, 512 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 108 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 512 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 99 registers, 512 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_60' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 174 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 512 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 108 registers, 512 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 108 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 512 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 99 registers, 512 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_61' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 174 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 512 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 114 registers, 512 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 162 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 114 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 66 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 106 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 66 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 512 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 99 registers, 512 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 66 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_70' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 512 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 108 registers, 512 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 108 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 512 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 99 registers, 512 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_52' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 174 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 82 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 114 registers, 512 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 114 registers, 512 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 164 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 114 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 114 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 104 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 104 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 84 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 102 registers, 512 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 102 registers, 512 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 66 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_75' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 512 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 108 registers, 512 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 163 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 108 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 1024 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 512 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 99 registers, 512 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_50' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 174 registers, 256 bytes smem, 396 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 84 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 114 registers, 512 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 114 registers, 512 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 164 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 114 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 114 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 104 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 104 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 84 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 104 registers, 512 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 104 registers, 512 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 66 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_86' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 512 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 114 registers, 512 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 162 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 114 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 104 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 66 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 1024 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 512 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 99 registers, 512 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 256 bytes smem, 428 bytes cmem[0] ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_80' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 256 bytes smem, 428 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 256 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIfffLi1ELi32ELi32ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES5_PKNS6_11ScalarIndexES9_PKfPS7_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 256 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 512 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float2S3_Li2ELi32ELi32ELi16ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 122 registers, 512 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 1024 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 182 registers, 1024 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 1024 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi64ELi8ELi4ELi0ELb1ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 1024 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 66 registers, 1024 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4S3_Li4ELi32ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers, 1024 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 1024 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 120 registers, 1024 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 66 registers, 1024 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float46float2Li4ELi32ELi16ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 106 registers, 1024 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 1024 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 1024 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 1024 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float4fLi4ELi32ELi8ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 78 registers, 1024 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 512 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIf6float26float4Li4ELi16ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES7_PKNS8_11ScalarIndexESB_PKfPS9_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 104 registers, 512 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 256 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li4ELi8ELi32ELi8ELi4ELi0ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 256 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c57Kernel2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 164 registers, 256 bytes smem ptxas info : Compiling entry function '_ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i' for 'sm_90' ptxas info : Function properties for _ZN7sputnik39_GLOBAL__N__345f6144_7_spmm_cu_4ffa47c517KernelWithBounds2INS_10SpmmConfigIff6float4Li1ELi32ELi128ELi32ELi4ELi1ELb0ELi8EEEEEviiiPKiPKNT_11ScalarValueES6_PKNS7_11ScalarIndexESA_PKfPS8_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 256 bytes smem [68/82] g++ -MMD -MF /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/boxing_unboxing.o.d -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/boxing_unboxing.cpp -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/boxing_unboxing.o -O3 -std=c++17 -fopenmp -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 In file included from /usr/lib64/python3.10/site-packages/torch/include/torch/csrc/utils/python_arg_parser.h:63, from /usr/lib64/python3.10/site-packages/torch/include/torch/csrc/jit/python/pybind_utils.h:26, from /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/boxing_unboxing.cpp:13: /usr/lib64/python3.10/site-packages/torch/include/torch/csrc/utils/python_strings.h:106:19: warning: ‘pybind11::object PyObject_FastGetAttrString(PyObject*, const char*)’ defined but not used [-Wunused-function] 106 | static py::object PyObject_FastGetAttrString(PyObject* obj, const char* name) { | ^~~~~~~~~~~~~~~~~~~~~~~~~~ [69/82] g++ -MMD -MF /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/sparse24/sparse24.o.d -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/sparse24/sparse24.cpp -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/sparse24/sparse24.o -O3 -std=c++17 -fopenmp -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 [70/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/nvcc_info.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/nvcc_info.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : 1 bytes gmem /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : 1 bytes gmem ptxas info : 1 bytes gmem ptxas info : 1 bytes gmem ptxas info : 1 bytes gmem ptxas info : 1 bytes gmem /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem [71/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/sparse24/meta_utils.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/sparse24/meta_utils.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/sparse24/meta_utils.cu(170): warning #177-D: variable "element" was declared but never referenced uint16_t element = 0; ^ Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/sparse24/meta_utils.cu(170): warning #177-D: variable "element" was declared but never referenced uint16_t element = 0; ^ Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/sparse24/meta_utils.cu(170): warning #177-D: variable "element" was declared but never referenced uint16_t element = 0; ^ Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/sparse24/meta_utils.cu(170): warning #177-D: variable "element" was declared but never referenced uint16_t element = 0; ^ Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/sparse24/meta_utils.cu(170): warning #177-D: variable "element" was declared but never referenced uint16_t element = 0; ^ Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/sparse24/meta_utils.cu(170): warning #177-D: variable "element" was declared but never referenced uint16_t element = 0; ^ Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 2 bytes gmem ptxas info : 2 bytes gmem ptxas info : 2 bytes gmem /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/sparse24/meta_utils.cu(170): warning #177-D: variable "element" was declared but never referenced uint16_t element = 0; ^ Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/sparse24/meta_utils.cu(170): warning #177-D: variable "element" was declared but never referenced uint16_t element = 0; ^ Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 2 bytes gmem ptxas info : 2 bytes gmem ptxas info : 2 bytes gmem ptxas info : 2 bytes gmem ptxas info : 2 bytes gmem /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/sparse24/meta_utils.cu(170): warning #177-D: variable "element" was declared but never referenced uint16_t element = 0; ^ Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 2 bytes gmem [72/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/sparse24/sparse24_apply.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/sparse24/sparse24_apply.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEEEEvNT_6ParamsE' for 'sm_50' ptxas info : Function properties for _ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 392 bytes cmem[0] ptxas info : Compiling entry function '_ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEEEEvNT_6ParamsE' for 'sm_50' ptxas info : Function properties for _ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 392 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEEEEvNT_6ParamsE' for 'sm_52' ptxas info : Function properties for _ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 392 bytes cmem[0] ptxas info : Compiling entry function '_ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEEEEvNT_6ParamsE' for 'sm_52' ptxas info : Function properties for _ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 392 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEEEEvNT_6ParamsE' for 'sm_70' ptxas info : Function properties for _ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 424 bytes cmem[0] ptxas info : Compiling entry function '_ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEEEEvNT_6ParamsE' for 'sm_70' ptxas info : Function properties for _ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 424 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEEEEvNT_6ParamsE' for 'sm_60' ptxas info : Function properties for _ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 392 bytes cmem[0] ptxas info : Compiling entry function '_ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEEEEvNT_6ParamsE' for 'sm_60' ptxas info : Function properties for _ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 392 bytes cmem[0] ptxas info : 1 bytes gmem, 32 bytes cmem[3] ptxas info : Compiling entry function '_ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEEEEvNT_6ParamsE' for 'sm_86' ptxas info : Function properties for _ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 92 registers, 424 bytes cmem[0] ptxas info : Compiling entry function '_ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEEEEvNT_6ParamsE' for 'sm_86' ptxas info : Function properties for _ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 92 registers, 424 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEEEEvNT_6ParamsE' for 'sm_90' ptxas info : Function properties for _ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers ptxas info : Compiling entry function '_ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEEEEvNT_6ParamsE' for 'sm_90' ptxas info : Function properties for _ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEEEEvNT_6ParamsE' for 'sm_75' ptxas info : Function properties for _ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 101 registers, 424 bytes cmem[0] ptxas info : Compiling entry function '_ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEEEEvNT_6ParamsE' for 'sm_75' ptxas info : Function properties for _ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 101 registers, 424 bytes cmem[0] ptxas info : 1 bytes gmem, 32 bytes cmem[3] ptxas info : Compiling entry function '_ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEEEEvNT_6ParamsE' for 'sm_80' ptxas info : Function properties for _ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 424 bytes cmem[0] ptxas info : Compiling entry function '_ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEEEEvNT_6ParamsE' for 'sm_80' ptxas info : Function properties for _ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 424 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEEEEvNT_6ParamsE' for 'sm_61' ptxas info : Function properties for _ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 392 bytes cmem[0] ptxas info : Compiling entry function '_ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEEEEvNT_6ParamsE' for 'sm_61' ptxas info : Function properties for _ZN58_GLOBAL__N__45c6b998_17_sparse24_apply_cu_ac8d307b_135627521sparse24_apply_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 392 bytes cmem[0] [73/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/sparse24/gemm.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/sparse24/gemm.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 484 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_20TileIteratorTensorOpIS1H_S1K_fS13_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_61' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_20TileIteratorTensorOpIS1H_S1K_fS13_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 86 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_20TileIteratorTensorOpIS1H_S1K_fSX_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_61' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_20TileIteratorTensorOpIS1H_S1K_fSX_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_61' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 104 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_61' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_61' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 86 registers, 696 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_61' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 696 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_61' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 102 registers, 696 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_61' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 696 bytes cmem[0] ptxas info : 484 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_20TileIteratorTensorOpIS1H_S1K_fS13_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_60' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_20TileIteratorTensorOpIS1H_S1K_fS13_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 86 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_20TileIteratorTensorOpIS1H_S1K_fSX_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_60' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_20TileIteratorTensorOpIS1H_S1K_fSX_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_60' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 104 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_60' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_60' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 86 registers, 696 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_60' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 696 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_60' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 102 registers, 696 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_60' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 696 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 2 bytes gmem, 32 bytes cmem[3] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_20TileIteratorTensorOpIS1H_S1K_fS13_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_80' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_20TileIteratorTensorOpIS1H_S1K_fS13_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_20TileIteratorTensorOpIS1H_S1K_fSX_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_80' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_20TileIteratorTensorOpIS1H_S1K_fSX_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_80' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_80' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_80' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_80' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_80' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_80' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 728 bytes cmem[0] ptxas info : 484 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_20TileIteratorTensorOpIS1H_S1K_fS13_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_70' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_20TileIteratorTensorOpIS1H_S1K_fS13_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 104 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_20TileIteratorTensorOpIS1H_S1K_fSX_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_70' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_20TileIteratorTensorOpIS1H_S1K_fSX_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_70' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 130 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_70' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 114 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_70' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 98 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_70' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_70' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_70' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 112 registers, 728 bytes cmem[0] ptxas info : 484 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_20TileIteratorTensorOpIS1H_S1K_fS13_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_50' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_20TileIteratorTensorOpIS1H_S1K_fS13_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 86 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_20TileIteratorTensorOpIS1H_S1K_fSX_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_50' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_20TileIteratorTensorOpIS1H_S1K_fSX_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_50' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 102 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_50' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_50' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_50' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_50' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 101 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_50' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem, 32 bytes cmem[3] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_20TileIteratorTensorOpIS1H_S1K_fS13_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_86' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_20TileIteratorTensorOpIS1H_S1K_fS13_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_20TileIteratorTensorOpIS1H_S1K_fSX_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_86' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_20TileIteratorTensorOpIS1H_S1K_fSX_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_86' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_86' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_86' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_86' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_86' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_86' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 728 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_20TileIteratorTensorOpIS1H_S1K_fS13_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_90' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_20TileIteratorTensorOpIS1H_S1K_fS13_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 233 registers ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_20TileIteratorTensorOpIS1H_S1K_fSX_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_90' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_20TileIteratorTensorOpIS1H_S1K_fSX_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_90' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_90' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_90' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_90' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_90' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_90' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers ptxas info : 484 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_20TileIteratorTensorOpIS1H_S1K_fS13_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_52' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_20TileIteratorTensorOpIS1H_S1K_fS13_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 86 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_20TileIteratorTensorOpIS1H_S1K_fSX_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_52' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_20TileIteratorTensorOpIS1H_S1K_fSX_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_52' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 102 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_52' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_52' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_52' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_52' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 101 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_52' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_20TileIteratorTensorOpIS1H_S1K_fS13_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_75' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_20TileIteratorTensorOpIS1H_S1K_fS13_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_20TileIteratorTensorOpIS1H_S1K_fSX_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_75' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_20TileIteratorTensorOpIS1H_S1K_fSX_EENS1W_18SharedLoadIteratorINS23_18CompactedThreadMapEfLi32EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_75' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_75' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_20TileIteratorTensorOpIS1I_S1L_fSF_EENS1X_18SharedLoadIteratorINS24_18CompactedThreadMapEfLi32EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_75' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi64ELi128EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ESY_Li16EEELSV_1EfNSE_8RowMajorENSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S11_fS13_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_S13_SD_SF_fS13_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEES13_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_75' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi256ELi32EEELi256ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_8RowMajorELi0ENSG_INSH_ILi128ELi64EEELi256ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSX_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSF_Li0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSX_NS1F_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1S_S1S_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1R_Li1ENS1W_22PredicatedTileIteratorINS1W_26OutputTileOptimalThreadMapINS1W_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS20_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1V_4warp24FragmentIteratorTensorOpIS1H_S1K_fNSL_IfLi4ELb1EEESX_EENS25_25TileIteratorTensorOpMixedIS1H_S1K_fLi32ELi16ELi8ELi8ELb0EEENS1W_23SharedLoadIteratorMixedINS23_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1V_6thread17LinearCombinationISD_Li8EffLNS2E_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_75' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi64ELi128EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi64EEELi1ES10_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtSX_Li0ES1A_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S13_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 728 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE' for 'sm_75' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel10SparseGemmINS1_11threadblock19SparseMmaMultistageINS1_9GemmShapeILi256ELi128ELi64EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi256ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi256EEELi256ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi64ELi128EEESD_SF_Li0ENSG_INSH_ILi128ELi64EEELi256ENSH_ILi8ELi4EEELi8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESZ_Li16EEELSV_1EfSF_NSA_INSB_ILi256ELi4EEEtNSE_22ColumnMajorInterleavedILi2EEELi1ENS8_30PitchLinearStripminedThreadMapINSH_ILi512ELi2EEELi128ELi8EEENSL_ItLi8ELb0EEELb0ESN_EENSP_INSB_ILi512ELi2EEEtNSE_11ColumnMajorELi0ES19_Li16EEELSV_1ENS4_15SparseMmaPolicyINS1_4warp17SparseMmaTensorOpINS6_ILi64ELi64ELi64EEESD_SR_SD_S12_fSF_NS1G_17MmaTensorOpPolicyINST_9SparseMmaINS6_ILi16ELi8ELi32EEELi32ESD_SF_SD_S1D_fSF_NST_13OpMultiplyAddELNST_12SPFormatType4KindE0EEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1T_S1T_Li1EEELi4EbEENS_8epilogue11threadblock8EpilogueIS7_S1S_Li1ENS1X_22PredicatedTileIteratorINS1X_26OutputTileOptimalThreadMapINS1X_15OutputTileShapeILi128ELi8ELi4ELi1ELi1EEENS21_ILi1ELi8ELi1ELi1ELi8EEELi256ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1W_4warp24FragmentIteratorTensorOpIS1I_S1L_fNSL_IfLi4ELb1EEESF_EENS26_25TileIteratorTensorOpMixedIS1I_S1L_fLi32ELi16ELi8ELi8ELb0EEENS1X_23SharedLoadIteratorMixedINS24_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1W_6thread17LinearCombinationISD_Li8EffLNS2F_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS4_30GemmIdentityThreadblockSwizzleILi3EEELb0EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 728 bytes cmem[0] [74/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/sparse24/sparse24_apply_dense_output.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/sparse24/sparse24_apply_dense_output.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE' for 'sm_70' ptxas info : Function properties for _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers, 408 bytes cmem[0] ptxas info : Compiling entry function '_ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE' for 'sm_70' ptxas info : Function properties for _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, 408 bytes cmem[0] ptxas warning : Value of minnctapersm for entry _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE is out of range. minnctapersm will be ignored ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE' for 'sm_75' ptxas info : Function properties for _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 78 registers, 408 bytes cmem[0] ptxas info : Compiling entry function '_ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE' for 'sm_75' ptxas info : Function properties for _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 73 registers, 408 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE' for 'sm_61' ptxas info : Function properties for _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 64 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE' for 'sm_61' ptxas info : Function properties for _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 64 registers, 376 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem, 32 bytes cmem[3] ptxas info : Compiling entry function '_ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE' for 'sm_80' ptxas info : Function properties for _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers, 408 bytes cmem[0] ptxas info : Compiling entry function '_ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE' for 'sm_80' ptxas info : Function properties for _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 408 bytes cmem[0] ptxas warning : Value of minnctapersm for entry _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE is out of range. minnctapersm will be ignored ptxas info : 1 bytes gmem, 32 bytes cmem[3] ptxas info : Compiling entry function '_ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE' for 'sm_86' ptxas info : Function properties for _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 408 bytes cmem[0] ptxas info : Compiling entry function '_ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE' for 'sm_86' ptxas info : Function properties for _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 73 registers, 408 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE' for 'sm_50' ptxas info : Function properties for _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 64 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE' for 'sm_50' ptxas info : Function properties for _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 64 registers, 376 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE' for 'sm_52' ptxas info : Function properties for _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 64 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE' for 'sm_52' ptxas info : Function properties for _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 64 registers, 376 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE' for 'sm_60' ptxas info : Function properties for _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 64 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE' for 'sm_60' ptxas info : Function properties for _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 64 registers, 376 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE' for 'sm_90' ptxas info : Function properties for _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb0ELb1EEEvNS_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers ptxas info : Compiling entry function '_ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE' for 'sm_90' ptxas info : Function properties for _ZN71_GLOBAL__N__265e8a1a_30_sparse24_apply_dense_output_cu_ac8d307b_135632329sparse24_apply_dense_output_kILb1ELb1EEEvNS_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers [75/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/sparse24/sparse24_largest_mask_2d.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/sparse24/sparse24_largest_mask_2d.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb1EEEvNT_6ParamsE' for 'sm_61' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb1EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 368 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb1EEEvNT_6ParamsE' for 'sm_61' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb1EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 368 bytes cmem[0] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb0EEEvNT_6ParamsE' for 'sm_61' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb0EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 59 registers, 368 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb0EEEvNT_6ParamsE' for 'sm_61' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb0EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 368 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb1EEEvNT_6ParamsE' for 'sm_70' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb1EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 400 bytes cmem[0] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb1EEEvNT_6ParamsE' for 'sm_70' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb1EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 400 bytes cmem[0] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb0EEEvNT_6ParamsE' for 'sm_70' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb0EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 400 bytes cmem[0] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb0EEEvNT_6ParamsE' for 'sm_70' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb0EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 400 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb1EEEvNT_6ParamsE' for 'sm_52' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb1EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 368 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb1EEEvNT_6ParamsE' for 'sm_52' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb1EEEvNT_6ParamsE 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 65 registers, 368 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb0EEEvNT_6ParamsE' for 'sm_52' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb0EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 59 registers, 368 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb0EEEvNT_6ParamsE' for 'sm_52' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb0EEEvNT_6ParamsE 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers, 368 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb1EEEvNT_6ParamsE' for 'sm_80' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb1EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 400 bytes cmem[0] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb1EEEvNT_6ParamsE' for 'sm_80' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb1EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 400 bytes cmem[0] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb0EEEvNT_6ParamsE' for 'sm_80' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb0EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 400 bytes cmem[0] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb0EEEvNT_6ParamsE' for 'sm_80' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb0EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 400 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb1EEEvNT_6ParamsE' for 'sm_50' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb1EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 368 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb1EEEvNT_6ParamsE' for 'sm_50' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb1EEEvNT_6ParamsE 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 65 registers, 368 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb0EEEvNT_6ParamsE' for 'sm_50' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb0EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 59 registers, 368 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb0EEEvNT_6ParamsE' for 'sm_50' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb0EEEvNT_6ParamsE 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers, 368 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb1EEEvNT_6ParamsE' for 'sm_75' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb1EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 400 bytes cmem[0] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb1EEEvNT_6ParamsE' for 'sm_75' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb1EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 400 bytes cmem[0] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb0EEEvNT_6ParamsE' for 'sm_75' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb0EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, 400 bytes cmem[0] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb0EEEvNT_6ParamsE' for 'sm_75' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb0EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 400 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb1EEEvNT_6ParamsE' for 'sm_86' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb1EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 400 bytes cmem[0] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb1EEEvNT_6ParamsE' for 'sm_86' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb1EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 400 bytes cmem[0] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb0EEEvNT_6ParamsE' for 'sm_86' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb0EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 400 bytes cmem[0] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb0EEEvNT_6ParamsE' for 'sm_86' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb0EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 61 registers, 400 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb1EEEvNT_6ParamsE' for 'sm_90' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb1EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb1EEEvNT_6ParamsE' for 'sm_90' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb1EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb0EEEvNT_6ParamsE' for 'sm_90' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb0EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb0EEEvNT_6ParamsE' for 'sm_90' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb0EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 60 registers /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb1EEEvNT_6ParamsE' for 'sm_60' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb1EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 368 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb1EEEvNT_6ParamsE' for 'sm_60' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb1EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 368 bytes cmem[0] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb0EEEvNT_6ParamsE' for 'sm_60' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass10bfloat16_tELb0EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 59 registers, 368 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb0EEEvNT_6ParamsE' for 'sm_60' ptxas info : Function properties for _Z27sparse24_largest_mask_2d_cuI14Sp24MaskKernelIN7cutlass6half_tELb0EEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 368 bytes cmem[0] [76/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/sparse24/sparse24_pack_test.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/sparse24/sparse24_pack_test.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN62_GLOBAL__N__0180d1c6_21_sparse24_pack_test_cu_ac8d307b_135658724meta_shuffle_test_kernelEN2at27GenericPackedTensorAccessorIlLm3ENS0_16DefaultPtrTraitsElEES3_b' for 'sm_60' ptxas info : Function properties for _ZN62_GLOBAL__N__0180d1c6_21_sparse24_pack_test_cu_ac8d307b_135658724meta_shuffle_test_kernelEN2at27GenericPackedTensorAccessorIlLm3ENS0_16DefaultPtrTraitsElEES3_b 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 433 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN62_GLOBAL__N__0180d1c6_21_sparse24_pack_test_cu_ac8d307b_135658724meta_shuffle_test_kernelEN2at27GenericPackedTensorAccessorIlLm3ENS0_16DefaultPtrTraitsElEES3_b' for 'sm_52' ptxas info : Function properties for _ZN62_GLOBAL__N__0180d1c6_21_sparse24_pack_test_cu_ac8d307b_135658724meta_shuffle_test_kernelEN2at27GenericPackedTensorAccessorIlLm3ENS0_16DefaultPtrTraitsElEES3_b 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 433 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN62_GLOBAL__N__0180d1c6_21_sparse24_pack_test_cu_ac8d307b_135658724meta_shuffle_test_kernelEN2at27GenericPackedTensorAccessorIlLm3ENS0_16DefaultPtrTraitsElEES3_b' for 'sm_50' ptxas info : Function properties for _ZN62_GLOBAL__N__0180d1c6_21_sparse24_pack_test_cu_ac8d307b_135658724meta_shuffle_test_kernelEN2at27GenericPackedTensorAccessorIlLm3ENS0_16DefaultPtrTraitsElEES3_b 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 433 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN62_GLOBAL__N__0180d1c6_21_sparse24_pack_test_cu_ac8d307b_135658724meta_shuffle_test_kernelEN2at27GenericPackedTensorAccessorIlLm3ENS0_16DefaultPtrTraitsElEES3_b' for 'sm_75' ptxas info : Function properties for _ZN62_GLOBAL__N__0180d1c6_21_sparse24_pack_test_cu_ac8d307b_135658724meta_shuffle_test_kernelEN2at27GenericPackedTensorAccessorIlLm3ENS0_16DefaultPtrTraitsElEES3_b 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 20 registers, 465 bytes cmem[0] ptxas info : 1 bytes gmem, 32 bytes cmem[3] ptxas info : Compiling entry function '_ZN62_GLOBAL__N__0180d1c6_21_sparse24_pack_test_cu_ac8d307b_135658724meta_shuffle_test_kernelEN2at27GenericPackedTensorAccessorIlLm3ENS0_16DefaultPtrTraitsElEES3_b' for 'sm_80' ptxas info : Function properties for _ZN62_GLOBAL__N__0180d1c6_21_sparse24_pack_test_cu_ac8d307b_135658724meta_shuffle_test_kernelEN2at27GenericPackedTensorAccessorIlLm3ENS0_16DefaultPtrTraitsElEES3_b 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 465 bytes cmem[0] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN62_GLOBAL__N__0180d1c6_21_sparse24_pack_test_cu_ac8d307b_135658724meta_shuffle_test_kernelEN2at27GenericPackedTensorAccessorIlLm3ENS0_16DefaultPtrTraitsElEES3_b' for 'sm_90' ptxas info : Function properties for _ZN62_GLOBAL__N__0180d1c6_21_sparse24_pack_test_cu_ac8d307b_135658724meta_shuffle_test_kernelEN2at27GenericPackedTensorAccessorIlLm3ENS0_16DefaultPtrTraitsElEES3_b 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers ptxas info : 1 bytes gmem, 32 bytes cmem[3] ptxas info : Compiling entry function '_ZN62_GLOBAL__N__0180d1c6_21_sparse24_pack_test_cu_ac8d307b_135658724meta_shuffle_test_kernelEN2at27GenericPackedTensorAccessorIlLm3ENS0_16DefaultPtrTraitsElEES3_b' for 'sm_86' ptxas info : Function properties for _ZN62_GLOBAL__N__0180d1c6_21_sparse24_pack_test_cu_ac8d307b_135658724meta_shuffle_test_kernelEN2at27GenericPackedTensorAccessorIlLm3ENS0_16DefaultPtrTraitsElEES3_b 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 465 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN62_GLOBAL__N__0180d1c6_21_sparse24_pack_test_cu_ac8d307b_135658724meta_shuffle_test_kernelEN2at27GenericPackedTensorAccessorIlLm3ENS0_16DefaultPtrTraitsElEES3_b' for 'sm_61' ptxas info : Function properties for _ZN62_GLOBAL__N__0180d1c6_21_sparse24_pack_test_cu_ac8d307b_135658724meta_shuffle_test_kernelEN2at27GenericPackedTensorAccessorIlLm3ENS0_16DefaultPtrTraitsElEES3_b 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 433 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN62_GLOBAL__N__0180d1c6_21_sparse24_pack_test_cu_ac8d307b_135658724meta_shuffle_test_kernelEN2at27GenericPackedTensorAccessorIlLm3ENS0_16DefaultPtrTraitsElEES3_b' for 'sm_70' ptxas info : Function properties for _ZN62_GLOBAL__N__0180d1c6_21_sparse24_pack_test_cu_ac8d307b_135658724meta_shuffle_test_kernelEN2at27GenericPackedTensorAccessorIlLm3ENS0_16DefaultPtrTraitsElEES3_b 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 465 bytes cmem[0] [77/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/swiglu/cuda/dual_gemm_silu_identity_mul.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/swiglu/cuda/dual_gemm_silu_identity_mul.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 2 bytes gmem, 32 bytes cmem[3] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_20TileIteratorTensorOpIS17_S1A_fSF_EENS1L_18SharedLoadIteratorINS1S_18CompactedThreadMapEfLi32EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE' for 'sm_80' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_20TileIteratorTensorOpIS17_S1A_fSF_EENS1L_18SharedLoadIteratorINS1S_18CompactedThreadMapEfLi32EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 1040 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_25TileIteratorTensorOpMixedIS17_S1A_fLi32ELi16ELi8ELi8ELb0EEENS1L_23SharedLoadIteratorMixedINS1S_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE' for 'sm_80' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_25TileIteratorTensorOpMixedIS17_S1A_fLi32ELi16ELi8ELi8ELb0EEENS1L_23SharedLoadIteratorMixedINS1S_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 1040 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1019 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_20TileIteratorTensorOpIS17_S1A_fSF_EENS1L_18SharedLoadIteratorINS1S_18CompactedThreadMapEfLi32EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE' for 'sm_52' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_20TileIteratorTensorOpIS17_S1A_fSF_EENS1L_18SharedLoadIteratorINS1S_18CompactedThreadMapEfLi32EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 1008 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_25TileIteratorTensorOpMixedIS17_S1A_fLi32ELi16ELi8ELi8ELb0EEENS1L_23SharedLoadIteratorMixedINS1S_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE' for 'sm_52' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_25TileIteratorTensorOpMixedIS17_S1A_fLi32ELi16ELi8ELi8ELb0EEENS1L_23SharedLoadIteratorMixedINS1S_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 1008 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 1019 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_20TileIteratorTensorOpIS17_S1A_fSF_EENS1L_18SharedLoadIteratorINS1S_18CompactedThreadMapEfLi32EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE' for 'sm_60' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_20TileIteratorTensorOpIS17_S1A_fSF_EENS1L_18SharedLoadIteratorINS1S_18CompactedThreadMapEfLi32EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 1008 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_25TileIteratorTensorOpMixedIS17_S1A_fLi32ELi16ELi8ELi8ELb0EEENS1L_23SharedLoadIteratorMixedINS1S_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE' for 'sm_60' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_25TileIteratorTensorOpMixedIS17_S1A_fLi32ELi16ELi8ELi8ELb0EEENS1L_23SharedLoadIteratorMixedINS1S_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 1008 bytes cmem[0] ptxas info : 2 bytes gmem, 32 bytes cmem[3] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_20TileIteratorTensorOpIS17_S1A_fSF_EENS1L_18SharedLoadIteratorINS1S_18CompactedThreadMapEfLi32EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE' for 'sm_86' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_20TileIteratorTensorOpIS17_S1A_fSF_EENS1L_18SharedLoadIteratorINS1S_18CompactedThreadMapEfLi32EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 1040 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_25TileIteratorTensorOpMixedIS17_S1A_fLi32ELi16ELi8ELi8ELb0EEENS1L_23SharedLoadIteratorMixedINS1S_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE' for 'sm_86' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_25TileIteratorTensorOpMixedIS17_S1A_fLi32ELi16ELi8ELi8ELb0EEENS1L_23SharedLoadIteratorMixedINS1S_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 1040 bytes cmem[0] ptxas info : 1019 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_20TileIteratorTensorOpIS17_S1A_fSF_EENS1L_18SharedLoadIteratorINS1S_18CompactedThreadMapEfLi32EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE' for 'sm_50' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_20TileIteratorTensorOpIS17_S1A_fSF_EENS1L_18SharedLoadIteratorINS1S_18CompactedThreadMapEfLi32EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 1008 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_25TileIteratorTensorOpMixedIS17_S1A_fLi32ELi16ELi8ELi8ELb0EEENS1L_23SharedLoadIteratorMixedINS1S_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE' for 'sm_50' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_25TileIteratorTensorOpMixedIS17_S1A_fLi32ELi16ELi8ELi8ELb0EEENS1L_23SharedLoadIteratorMixedINS1S_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 1008 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_20TileIteratorTensorOpIS17_S1A_fSF_EENS1L_18SharedLoadIteratorINS1S_18CompactedThreadMapEfLi32EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE' for 'sm_90' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_20TileIteratorTensorOpIS17_S1A_fSF_EENS1L_18SharedLoadIteratorINS1S_18CompactedThreadMapEfLi32EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_25TileIteratorTensorOpMixedIS17_S1A_fLi32ELi16ELi8ELi8ELb0EEENS1L_23SharedLoadIteratorMixedINS1S_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE' for 'sm_90' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_25TileIteratorTensorOpMixedIS17_S1A_fLi32ELi16ELi8ELi8ELb0EEENS1L_23SharedLoadIteratorMixedINS1S_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers ptxas info : 1019 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_20TileIteratorTensorOpIS17_S1A_fSF_EENS1L_18SharedLoadIteratorINS1S_18CompactedThreadMapEfLi32EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE' for 'sm_61' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_20TileIteratorTensorOpIS17_S1A_fSF_EENS1L_18SharedLoadIteratorINS1S_18CompactedThreadMapEfLi32EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 1008 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_25TileIteratorTensorOpMixedIS17_S1A_fLi32ELi16ELi8ELi8ELb0EEENS1L_23SharedLoadIteratorMixedINS1S_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE' for 'sm_61' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_25TileIteratorTensorOpMixedIS17_S1A_fLi32ELi16ELi8ELi8ELb0EEENS1L_23SharedLoadIteratorMixedINS1S_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 1008 bytes cmem[0] ptxas info : 866 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_20TileIteratorTensorOpIS17_S1A_fSF_EENS1L_18SharedLoadIteratorINS1S_18CompactedThreadMapEfLi32EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE' for 'sm_75' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_20TileIteratorTensorOpIS17_S1A_fSF_EENS1L_18SharedLoadIteratorINS1S_18CompactedThreadMapEfLi32EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 82 registers, 1040 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_25TileIteratorTensorOpMixedIS17_S1A_fLi32ELi16ELi8ELi8ELb0EEENS1L_23SharedLoadIteratorMixedINS1S_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE' for 'sm_75' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_25TileIteratorTensorOpMixedIS17_S1A_fLi32ELi16ELi8ELi8ELb0EEENS1L_23SharedLoadIteratorMixedINS1S_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 82 registers, 1040 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1019 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_20TileIteratorTensorOpIS17_S1A_fSF_EENS1L_18SharedLoadIteratorINS1S_18CompactedThreadMapEfLi32EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE' for 'sm_70' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_20TileIteratorTensorOpIS17_S1A_fSF_EENS1L_18SharedLoadIteratorINS1S_18CompactedThreadMapEfLi32EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 114 registers, 1040 bytes cmem[0] ptxas info : Compiling entry function '_ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_25TileIteratorTensorOpMixedIS17_S1A_fLi32ELi16ELi8ELi8ELb0EEENS1L_23SharedLoadIteratorMixedINS1S_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE' for 'sm_70' ptxas info : Function properties for _ZN7cutlass6KernelINS_4gemm6kernel8DualGemmINS1_11threadblock17DualMmaMultistageINS1_9GemmShapeILi128ELi64ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout8RowMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi128ENSH_ILi4ELi8EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi64EEESD_NSE_11ColumnMajorELi0ENSG_INSH_ILi32ELi64EEELi128ESJ_Li8EEESM_Lb0ESN_EENSP_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESZ_Li16EEELSV_1ES10_S13_fSF_NS4_9MmaPolicyINS1_4warp11MmaTensorOpINS6_ILi64ELi32ELi32EEESD_SR_SD_S12_fSF_NS15_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SF_SD_SX_fSF_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1G_Li1EEES1H_Li3ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1F_Li1ENS1L_22PredicatedTileIteratorINS1L_26OutputTileOptimalThreadMapINS1L_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1P_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1K_4warp24FragmentIteratorTensorOpIS17_S1A_fNSL_IfLi4ELb1EEESF_EENS1U_25TileIteratorTensorOpMixedIS17_S1A_fLi32ELi16ELi8ELi8ELb0EEENS1L_23SharedLoadIteratorMixedINS1S_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1K_6thread17LinearCombinationISD_Li8EffLNS23_9ScaleType4KindE1ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEES2A_NS23_14LeftSiLUAndMulISD_Li8ESD_fLS27_2EEENS4_30GemmIdentityThreadblockSwizzleILi2EEELb0ELb1ELb1EEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 114 registers, 1040 bytes cmem[0] [78/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/swiglu/cuda/gemm_fused_operand_sum.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/swiglu/cuda/gemm_fused_operand_sum.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 178 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_20TileIteratorTensorOpIS15_S18_fSX_EENS1J_18SharedLoadIteratorINS1Q_18CompactedThreadMapEfLi32EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE' for 'sm_70' ptxas info : Function properties for _ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_20TileIteratorTensorOpIS15_S18_fSX_EENS1J_18SharedLoadIteratorINS1Q_18CompactedThreadMapEfLi32EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_25TileIteratorTensorOpMixedIS15_S18_fLi32ELi16ELi8ELi8ELb0EEENS1J_23SharedLoadIteratorMixedINS1Q_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE' for 'sm_70' ptxas info : Function properties for _ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_25TileIteratorTensorOpMixedIS15_S18_fLi32ELi16ELi8ELi8ELb0EEENS1J_23SharedLoadIteratorMixedINS1Q_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 696 bytes cmem[0], 12 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 178 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_20TileIteratorTensorOpIS15_S18_fSX_EENS1J_18SharedLoadIteratorINS1Q_18CompactedThreadMapEfLi32EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE' for 'sm_61' ptxas info : Function properties for _ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_20TileIteratorTensorOpIS15_S18_fSX_EENS1J_18SharedLoadIteratorINS1Q_18CompactedThreadMapEfLi32EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 664 bytes cmem[0], 20 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_25TileIteratorTensorOpMixedIS15_S18_fLi32ELi16ELi8ELi8ELb0EEENS1J_23SharedLoadIteratorMixedINS1Q_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE' for 'sm_61' ptxas info : Function properties for _ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_25TileIteratorTensorOpMixedIS15_S18_fLi32ELi16ELi8ELi8ELb0EEENS1J_23SharedLoadIteratorMixedINS1Q_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 664 bytes cmem[0], 8 bytes cmem[2] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_20TileIteratorTensorOpIS15_S18_fSX_EENS1J_18SharedLoadIteratorINS1Q_18CompactedThreadMapEfLi32EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE' for 'sm_75' ptxas info : Function properties for _ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_20TileIteratorTensorOpIS15_S18_fSX_EENS1J_18SharedLoadIteratorINS1Q_18CompactedThreadMapEfLi32EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_25TileIteratorTensorOpMixedIS15_S18_fLi32ELi16ELi8ELi8ELb0EEENS1J_23SharedLoadIteratorMixedINS1Q_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE' for 'sm_75' ptxas info : Function properties for _ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_25TileIteratorTensorOpMixedIS15_S18_fLi32ELi16ELi8ELi8ELb0EEENS1J_23SharedLoadIteratorMixedINS1Q_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : 178 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_20TileIteratorTensorOpIS15_S18_fSX_EENS1J_18SharedLoadIteratorINS1Q_18CompactedThreadMapEfLi32EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE' for 'sm_52' ptxas info : Function properties for _ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_20TileIteratorTensorOpIS15_S18_fSX_EENS1J_18SharedLoadIteratorINS1Q_18CompactedThreadMapEfLi32EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 664 bytes cmem[0], 20 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_25TileIteratorTensorOpMixedIS15_S18_fLi32ELi16ELi8ELi8ELb0EEENS1J_23SharedLoadIteratorMixedINS1Q_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE' for 'sm_52' ptxas info : Function properties for _ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_25TileIteratorTensorOpMixedIS15_S18_fLi32ELi16ELi8ELi8ELb0EEENS1J_23SharedLoadIteratorMixedINS1Q_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 664 bytes cmem[0], 20 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_20TileIteratorTensorOpIS15_S18_fSX_EENS1J_18SharedLoadIteratorINS1Q_18CompactedThreadMapEfLi32EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE' for 'sm_90' ptxas info : Function properties for _ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_20TileIteratorTensorOpIS15_S18_fSX_EENS1J_18SharedLoadIteratorINS1Q_18CompactedThreadMapEfLi32EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers ptxas info : Compiling entry function '_ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_25TileIteratorTensorOpMixedIS15_S18_fLi32ELi16ELi8ELi8ELb0EEENS1J_23SharedLoadIteratorMixedINS1Q_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE' for 'sm_90' ptxas info : Function properties for _ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_25TileIteratorTensorOpMixedIS15_S18_fLi32ELi16ELi8ELi8ELb0EEENS1J_23SharedLoadIteratorMixedINS1Q_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers ptxas info : 2 bytes gmem, 32 bytes cmem[3] ptxas info : Compiling entry function '_ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_20TileIteratorTensorOpIS15_S18_fSX_EENS1J_18SharedLoadIteratorINS1Q_18CompactedThreadMapEfLi32EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE' for 'sm_80' ptxas info : Function properties for _ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_20TileIteratorTensorOpIS15_S18_fSX_EENS1J_18SharedLoadIteratorINS1Q_18CompactedThreadMapEfLi32EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_25TileIteratorTensorOpMixedIS15_S18_fLi32ELi16ELi8ELi8ELb0EEENS1J_23SharedLoadIteratorMixedINS1Q_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE' for 'sm_80' ptxas info : Function properties for _ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_25TileIteratorTensorOpMixedIS15_S18_fLi32ELi16ELi8ELi8ELb0EEENS1J_23SharedLoadIteratorMixedINS1Q_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : 178 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_20TileIteratorTensorOpIS15_S18_fSX_EENS1J_18SharedLoadIteratorINS1Q_18CompactedThreadMapEfLi32EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE' for 'sm_60' ptxas info : Function properties for _ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_20TileIteratorTensorOpIS15_S18_fSX_EENS1J_18SharedLoadIteratorINS1Q_18CompactedThreadMapEfLi32EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 664 bytes cmem[0], 20 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_25TileIteratorTensorOpMixedIS15_S18_fLi32ELi16ELi8ELi8ELb0EEENS1J_23SharedLoadIteratorMixedINS1Q_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE' for 'sm_60' ptxas info : Function properties for _ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_25TileIteratorTensorOpMixedIS15_S18_fLi32ELi16ELi8ELi8ELb0EEENS1J_23SharedLoadIteratorMixedINS1Q_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 664 bytes cmem[0], 8 bytes cmem[2] ptxas info : 178 bytes gmem ptxas info : Compiling entry function '_ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_20TileIteratorTensorOpIS15_S18_fSX_EENS1J_18SharedLoadIteratorINS1Q_18CompactedThreadMapEfLi32EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE' for 'sm_50' ptxas info : Function properties for _ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_20TileIteratorTensorOpIS15_S18_fSX_EENS1J_18SharedLoadIteratorINS1Q_18CompactedThreadMapEfLi32EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 664 bytes cmem[0], 20 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_25TileIteratorTensorOpMixedIS15_S18_fLi32ELi16ELi8ELi8ELb0EEENS1J_23SharedLoadIteratorMixedINS1Q_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE' for 'sm_50' ptxas info : Function properties for _ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_25TileIteratorTensorOpMixedIS15_S18_fLi32ELi16ELi8ELi8ELb0EEENS1J_23SharedLoadIteratorMixedINS1Q_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 664 bytes cmem[0], 20 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 2 bytes gmem, 32 bytes cmem[3] ptxas info : Compiling entry function '_ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_20TileIteratorTensorOpIS15_S18_fSX_EENS1J_18SharedLoadIteratorINS1Q_18CompactedThreadMapEfLi32EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE' for 'sm_86' ptxas info : Function properties for _ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_10bfloat16_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_20TileIteratorTensorOpIS15_S18_fSX_EENS1J_18SharedLoadIteratorINS1Q_18CompactedThreadMapEfLi32EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi1ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 696 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_25TileIteratorTensorOpMixedIS15_S18_fLi32ELi16ELi8ELi8ELb0EEENS1J_23SharedLoadIteratorMixedINS1Q_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE' for 'sm_86' ptxas info : Function properties for _ZN7cutlass7Kernel2INS_4gemm6kernel18GemmWithKReductionINS1_11threadblock26MmaWithReductionMultistageINS1_9GemmShapeILi128ELi128ELi32EEENS_9transform11threadblock28PredicatedTileAccessIteratorINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout11ColumnMajorELi1ENS8_29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi128ELi32EEELi128ENSH_ILi8ELi4EEELi8EEENS_5ArrayISD_Li8ELb0EEELb0ENSE_9NoPermuteEEENS9_25RegularTileAccessIteratorISC_SD_NSE_40ColumnMajorTensorOpMultiplicandCongruousILi16ELi64EEELi1ESK_Li16EEELNS_4arch14CacheOperation4KindE1ENSA_INSB_ILi32ELi128EEESD_NSE_8RowMajorELi0ESK_SM_Lb0ESN_EENSP_ISW_SD_NSE_37RowMajorTensorOpMultiplicandCongruousILi16ELi64EEELi0ESK_Li16EEELSV_1EfSX_NS4_9MmaPolicyINS1_4warp24MmaWithReductionTensorOpINS6_ILi64ELi64ELi32EEESD_SR_SD_S10_fSX_NS13_17MmaTensorOpPolicyINST_3MmaINS6_ILi16ELi8ELi16EEELi32ESD_SX_SD_SF_fSX_NST_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELb1ELi1ELb0EbEENSB_ILi0ELi0EEES1E_Li1EEELi4ELNS1_23SharedMemoryClearOptionE0EbEENS_8epilogue11threadblock8EpilogueIS7_S1D_Li1ENS1J_22PredicatedTileIteratorINS1J_26OutputTileOptimalThreadMapINS1J_15OutputTileShapeILi128ELi8ELi2ELi1ELi1EEENS1N_ILi1ELi8ELi1ELi1ELi8EEELi128ELi8ELi16EEESD_Lb0ESN_Lb0EEENS1I_4warp24FragmentIteratorTensorOpIS15_S18_fNSL_IfLi4ELb1EEESX_EENS1S_25TileIteratorTensorOpMixedIS15_S18_fLi32ELi16ELi8ELi8ELb0EEENS1J_23SharedLoadIteratorMixedINS1Q_18CompactedThreadMapEfLi32ELi16ELi8ELi8ELb0EEENS1I_6thread17LinearCombinationISD_Li8EffLNS21_9ScaleType4KindE0ELNS_15FloatRoundStyleE2ESD_EENSB_ILi0ELi8EEELi2ELi1EEENS1J_22EpilogueGemmKReductionIfSD_S7_S1D_Lb1EEENS4_30GemmIdentityThreadblockSwizzleILi8EEEEEEEvNT_6ParamsE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 696 bytes cmem[0], 12 bytes cmem[2] [79/82] g++ -MMD -MF /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/swiglu/swiglu_op.o.d -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/swiglu/swiglu_op.cpp -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/swiglu/swiglu_op.o -O3 -std=c++17 -fopenmp -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 [80/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/sparse24/sparse24_pack.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/sparse24/sparse24_pack.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 33 bytes gmem ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_70' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_70' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_70' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_70' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_70' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_70' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_70' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_70' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_70' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_70' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_70' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 72 bytes stack frame, 72 bytes spill stores, 72 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_70' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : 33 bytes gmem ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_50' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_50' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_50' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_50' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_50' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_50' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_50' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 425 bytes cmem[0], 44 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_50' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 48 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_50' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 425 bytes cmem[0], 44 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_50' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 425 bytes cmem[0], 44 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_50' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 48 bytes stack frame, 40 bytes spill stores, 44 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_50' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 425 bytes cmem[0], 44 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 33 bytes gmem ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_60' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_60' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_60' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_60' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_60' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_60' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_60' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_60' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_60' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_60' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_60' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_60' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : 33 bytes gmem ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_52' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_52' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_52' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_52' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_52' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_52' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_52' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 425 bytes cmem[0], 44 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_52' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 48 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_52' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 425 bytes cmem[0], 44 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_52' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 425 bytes cmem[0], 44 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_52' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 48 bytes stack frame, 40 bytes spill stores, 44 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_52' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 425 bytes cmem[0], 44 bytes cmem[2] ptxas info : Function properties for _ZN7cutlass6half_t7convertERKS0_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN7cutlass6half_t7convertERKf 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas info : 33 bytes gmem ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_75' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 133 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_75' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 147 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_75' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 133 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_75' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 132 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_75' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 137 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_75' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 132 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_75' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 131 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_75' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 141 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_75' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 135 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_75' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 132 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_75' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 139 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_75' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 134 registers, 457 bytes cmem[0] ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas warning : Value of minnctapersm for entry _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ is out of range. minnctapersm will be ignored ptxas info : 33 bytes gmem, 32 bytes cmem[3] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_86' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 132 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_86' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 135 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_86' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 132 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_86' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_86' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 137 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_86' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 128 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_86' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 140 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_86' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 144 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_86' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 135 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_86' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 138 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_86' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 145 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_86' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 134 registers, 457 bytes cmem[0] ptxas info : 33 bytes gmem ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_61' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_61' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_61' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_61' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_61' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_61' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_61' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_61' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_61' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_61' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_61' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_61' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 425 bytes cmem[0] ptxas info : 33 bytes gmem, 32 bytes cmem[3] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_80' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_80' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_80' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_80' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_80' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_80' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_80' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_80' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_80' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_80' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_80' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_80' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 457 bytes cmem[0] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 33 bytes gmem ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_90' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_90' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_90' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_90' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_90' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_90' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass10bfloat16_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_90' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_90' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_90' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_15MetadataCutlassENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_' for 'sm_90' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_5AbsOpEEEEEvNT_6ParamsET0_T1_ 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_90' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_10Causal1122INS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_' for 'sm_90' ptxas info : Function properties for _ZN57_GLOBAL__N__7dcbb2fd_16_sparse24_pack_cu_ac8d307b_135654734sparse24_sparsify_both_ways_kernelIN8xformers4sp2411KernelTypesIN7cutlass6half_tEEENS_18MetadataCuSparseLtENS2_19LargestValuesGreedyINS2_10IdentityOpEEEEEvNT_6ParamsET0_T1_ 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 96 registers [81/82] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/swiglu/cuda/silu_bw_fused.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/swiglu/cuda/silu_bw_fused.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v --ptxas-options=-O2 --ptxas-options=-allow-expensive-optimizations=true -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_52' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_52' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_52' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_52' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_52' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_52' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_52' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 1592 bytes cmem[0], 124 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_52' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 386 bytes cmem[0], 124 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_52' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_52' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_52' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_52' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_52' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_52' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_52' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 1592 bytes cmem[0], 124 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_52' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 386 bytes cmem[0], 124 bytes cmem[2] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_60' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_60' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_60' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_60' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_60' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_60' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_60' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 1592 bytes cmem[0], 124 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_60' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 386 bytes cmem[0], 124 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_60' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_60' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_60' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_60' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_60' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_60' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_60' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 1592 bytes cmem[0], 124 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_60' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 386 bytes cmem[0], 124 bytes cmem[2] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_50' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_50' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_50' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_50' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_50' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_50' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_50' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 1592 bytes cmem[0], 124 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_50' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 386 bytes cmem[0], 124 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_50' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_50' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_50' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_50' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_50' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_50' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_50' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 1592 bytes cmem[0], 124 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_50' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 386 bytes cmem[0], 124 bytes cmem[2] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_70' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_70' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_70' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_70' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_70' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_70' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_70' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 1624 bytes cmem[0], 88 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_70' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 418 bytes cmem[0], 88 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_70' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_70' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_70' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_70' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_70' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_70' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_70' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 1624 bytes cmem[0], 88 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_70' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 418 bytes cmem[0], 88 bytes cmem[2] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_86' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_86' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_86' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_86' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_86' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_86' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_86' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 1624 bytes cmem[0], 88 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_86' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 418 bytes cmem[0], 88 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_86' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_86' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_86' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_86' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_86' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_86' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_86' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 1624 bytes cmem[0], 88 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_86' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 418 bytes cmem[0], 88 bytes cmem[2] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_90' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_90' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_90' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_90' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_90' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_90' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_90' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_90' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_90' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_90' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_90' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_90' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_90' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_90' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_90' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_90' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_80' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_80' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_80' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_80' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_80' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_80' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_80' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 1624 bytes cmem[0], 88 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_80' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 418 bytes cmem[0], 88 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_80' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_80' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_80' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_80' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_80' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_80' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_80' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 1624 bytes cmem[0], 88 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_80' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 418 bytes cmem[0], 88 bytes cmem[2] ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_61' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_61' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_61' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_61' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_61' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_61' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_61' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 1592 bytes cmem[0], 124 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_61' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 386 bytes cmem[0], 124 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_61' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_61' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_61' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_61' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_61' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1592 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_61' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 386 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_61' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 1592 bytes cmem[0], 124 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_61' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 386 bytes cmem[0], 124 bytes cmem[2] /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=std::size_t, one_sided=true, =0]" at line 73 of /usr/lib64/python3.10/site-packages/torch/include/ATen/core/qualified_name.h ptxas info : 1 bytes gmem ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_75' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_75' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_75' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_75' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 33 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_75' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_75' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_75' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 1624 bytes cmem[0], 88 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_75' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb1EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 418 bytes cmem[0], 88 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_75' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_75' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE2_clEvEUlN3c108BFloat16ESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_' for 'sm_75' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_' for 'sm_75' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE1_clEvEUlN3c104HalfESE_SE_E_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESL_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 33 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_75' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 1624 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_75' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE0_clEvEUlfffE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 418 bytes cmem[0] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_' for 'sm_75' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi3EjLb0EESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 1624 bytes cmem[0], 88 bytes cmem[2] ptxas info : Compiling entry function '_ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_' for 'sm_75' ptxas info : Function properties for _ZN2at6native57_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679845unrolled_elementwise_kernel_for_multi_outputsILi3EZZZN55_INTERNAL_a6f69002_16_silu_bw_fused_cu_ac8d307b_135679857_GLOBAL__N__a6f69002_16_silu_bw_fused_cu_ac8d307b_135679813silu_bw_fusedILb0EEESt5tupleIJNS_6TensorES7_EERKS7_SA_SA_ENKUlvE_clEvENKUlvE_clEvEUldddE_NS_6detail5ArrayIPcLi6EEE23TrivialOffsetCalculatorILi3EjESJ_EEviT0_T1_T2_T3_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 418 bytes cmem[0], 88 bytes cmem[2] [82/82] g++ -MMD -MF /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/swiglu/swiglu_packedw.o.d -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC -I/builddir/build/BUILD/xformers-0.0.24/xformers/csrc -I/builddir/build/BUILD/xformers-0.0.24/third_party/sputnik -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/include -I/builddir/build/BUILD/xformers-0.0.24/third_party/cutlass/examples -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/xformers/csrc/swiglu/swiglu_packedw.cpp -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/swiglu/swiglu_packedw.o -O3 -std=c++17 -fopenmp -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C -D_GLIBCXX_USE_CXX11_ABI=1 g++ -shared -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -Wl,--build-id=sha1 -g -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -Wl,--build-id=sha1 -g -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/attention.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/autograd/matmul.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cpu/matmul.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cpu/sddmm.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cpu/sparse_softmax.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cpu/spmm.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/attention_backward_generic.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/attention_cutlass_rand_uniform.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/attention_forward_generic.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k128.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k128_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k32.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k32_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k64.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k64_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k65536.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k65536_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_bf16_aligned_k96.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k128.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k128_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k32.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k32_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k64.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k64_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k65536.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k65536_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_aligned_k96.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k128.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k128_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k32.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k32_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k64.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k64_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k65536.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f16_notaligned_k65536_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k128.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k128_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k32.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k32_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k64.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k64_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k65536.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_aligned_k65536_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k128.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k128_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k32.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k32_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k64.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k64_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k65536.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassB_f32_notaligned_k65536_dropout.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassF_bf16_aligned.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassF_f16_aligned.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassF_f16_notaligned.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassF_f32_aligned.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassF_f32_notaligned.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/decoder.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/fmha/small_k.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/matmul.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/sddmm.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/sddmm2_cuda.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/sparse_softmax.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/cuda/spmm.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/matmul.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/sddmm.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/sparse_softmax.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/attention/spmm.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/boxing_unboxing.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/nvcc_info.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/sparse24/gemm.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/sparse24/meta_utils.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/sparse24/sparse24.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/sparse24/sparse24_apply.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/sparse24/sparse24_apply_dense_output.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/sparse24/sparse24_largest_mask_2d.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/sparse24/sparse24_pack.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/sparse24/sparse24_pack_test.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/swiglu/cuda/dual_gemm_silu_identity_mul.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/swiglu/cuda/gemm_fused_operand_sum.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/swiglu/cuda/silu_bw_fused.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/swiglu/swiglu_op.o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/xformers/csrc/swiglu/swiglu_packedw.o -L/usr/lib64/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.1/lib64 -L/usr/lib64 -lc10 -ltorch -ltorch_cpu -ltorch_python -lcudart -lc10_cuda -ltorch_cuda -lpython3.10 -o build/lib.linux-aarch64-cpython-310/xformers/_C.so + RPM_EC=0 ++ jobs -p + exit 0 Executing(%install): /bin/sh -e /var/tmp/rpm-tmp.jvA6Co + umask 022 + cd /builddir/build/BUILD + '[' /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64 '!=' / ']' + rm -rf /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64 ++ dirname /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64 + mkdir -p /builddir/build/BUILDROOT + mkdir /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64 + CFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + export CFLAGS + CXXFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + export CXXFLAGS + FFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -I/usr/lib64/gfortran/modules' + export FFLAGS + FCFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -I/usr/lib64/gfortran/modules' + export FCFLAGS + LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 ' + export LDFLAGS + LT_SYS_LIBRARY_PATH=/usr/lib64: + export LT_SYS_LIBRARY_PATH + CC=gcc + export CC + CXX=g++ + export CXX + cd xformers-0.0.24 + CFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 ' + /usr/bin/python3 setup.py install -O1 --skip-build --root /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64 No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda' running install /usr/lib/python3.10/site-packages/setuptools/command/install.py:34: SetuptoolsDeprecationWarning: setup.py install is deprecated. Use build and pip and other standards-based tools. warnings.warn( running install_lib creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64 creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10 creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers copying build/lib.linux-aarch64-cpython-310/xformers/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers copying build/lib.linux-aarch64-cpython-310/xformers/_cpp_lib.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers copying build/lib.linux-aarch64-cpython-310/xformers/_deprecation_warning.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers copying build/lib.linux-aarch64-cpython-310/xformers/attn_bias_utils.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers copying build/lib.linux-aarch64-cpython-310/xformers/checkpoint.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers copying build/lib.linux-aarch64-cpython-310/xformers/info.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers copying build/lib.linux-aarch64-cpython-310/xformers/test.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers copying build/lib.linux-aarch64-cpython-310/xformers/utils.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_attn_decoding.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_blocksparse_transformers.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_causal_blocksparse.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_core.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_indexing.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_mem_eff_attention.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_mem_eff_attn_decoder.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_mlp.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_multi_head_dispatch.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_nystrom_utils.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_revnet.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_sddmm.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_sequence_parallel_fused.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_sp24.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_swiglu.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_tiled_matmul.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_transformer.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_triton_blocksparse.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_triton_dropout.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_triton_fused_linear.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_triton_layernorm.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/benchmark_triton_softmax.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/utils.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA/batch_fetch_results.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA/batch_submit.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA/run_grid_search.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA/run_tasks.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA/run_with_submitit.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA/code copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA/code/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA/code copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA/code/dataset.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA/code copying build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA/code/model_wrapper.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA/code creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components copying build/lib.linux-aarch64-cpython-310/xformers/components/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components copying build/lib.linux-aarch64-cpython-310/xformers/components/activations.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components copying build/lib.linux-aarch64-cpython-310/xformers/components/input_projection.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components copying build/lib.linux-aarch64-cpython-310/xformers/components/multi_head_dispatch.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components copying build/lib.linux-aarch64-cpython-310/xformers/components/patch_embedding.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components copying build/lib.linux-aarch64-cpython-310/xformers/components/residual.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components copying build/lib.linux-aarch64-cpython-310/xformers/components/reversible.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components copying build/lib.linux-aarch64-cpython-310/xformers/components/simplicial_embedding.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/_sputnik_sparse.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/attention_mask.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/attention_patterns.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/base.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/blocksparse.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/compositional.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/core.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/favor.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/fourier_mix.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/global_tokens.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/lambda_layer.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/linformer.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/local.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/nystrom.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/ortho.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/pooling.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/random.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/scaled_dot_product.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/sparsity_config.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/utils.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/visual.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/feature_maps copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/feature_maps/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/feature_maps copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/feature_maps/base.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/feature_maps copying build/lib.linux-aarch64-cpython-310/xformers/components/attention/feature_maps/softmax.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/feature_maps creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/feedforward copying build/lib.linux-aarch64-cpython-310/xformers/components/feedforward/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/feedforward copying build/lib.linux-aarch64-cpython-310/xformers/components/feedforward/base.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/feedforward copying build/lib.linux-aarch64-cpython-310/xformers/components/feedforward/conv_mlp.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/feedforward copying build/lib.linux-aarch64-cpython-310/xformers/components/feedforward/fused_mlp.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/feedforward copying build/lib.linux-aarch64-cpython-310/xformers/components/feedforward/mixture_of_experts.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/feedforward copying build/lib.linux-aarch64-cpython-310/xformers/components/feedforward/mlp.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/feedforward creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/positional_embedding copying build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/positional_embedding copying build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding/base.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/positional_embedding copying build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding/param.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/positional_embedding copying build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding/rotary.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/positional_embedding copying build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding/sine.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/positional_embedding copying build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding/vocab.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/positional_embedding creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/factory copying build/lib.linux-aarch64-cpython-310/xformers/factory/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/factory copying build/lib.linux-aarch64-cpython-310/xformers/factory/block_configs.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/factory copying build/lib.linux-aarch64-cpython-310/xformers/factory/block_factory.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/factory copying build/lib.linux-aarch64-cpython-310/xformers/factory/hydra_helper.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/factory copying build/lib.linux-aarch64-cpython-310/xformers/factory/model_factory.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/factory copying build/lib.linux-aarch64-cpython-310/xformers/factory/weight_init.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/factory creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/helpers copying build/lib.linux-aarch64-cpython-310/xformers/helpers/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/helpers copying build/lib.linux-aarch64-cpython-310/xformers/helpers/hierarchical_configs.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/helpers copying build/lib.linux-aarch64-cpython-310/xformers/helpers/test_utils.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/helpers copying build/lib.linux-aarch64-cpython-310/xformers/helpers/timm_sparse_attention.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/helpers creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops copying build/lib.linux-aarch64-cpython-310/xformers/ops/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops copying build/lib.linux-aarch64-cpython-310/xformers/ops/common.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops copying build/lib.linux-aarch64-cpython-310/xformers/ops/differentiable_collectives.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops copying build/lib.linux-aarch64-cpython-310/xformers/ops/indexing.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops copying build/lib.linux-aarch64-cpython-310/xformers/ops/modpar_layers.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops copying build/lib.linux-aarch64-cpython-310/xformers/ops/rmsnorm.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops copying build/lib.linux-aarch64-cpython-310/xformers/ops/rope_padded.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops copying build/lib.linux-aarch64-cpython-310/xformers/ops/seqpar.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops copying build/lib.linux-aarch64-cpython-310/xformers/ops/sequence_parallel_fused_ops.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops copying build/lib.linux-aarch64-cpython-310/xformers/ops/sp24.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops copying build/lib.linux-aarch64-cpython-310/xformers/ops/swiglu_op.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops copying build/lib.linux-aarch64-cpython-310/xformers/ops/tiled_matmul.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops copying build/lib.linux-aarch64-cpython-310/xformers/ops/unbind.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/_triton copying build/lib.linux-aarch64-cpython-310/xformers/ops/_triton/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/_triton copying build/lib.linux-aarch64-cpython-310/xformers/ops/_triton/k_index_select_cat.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/_triton copying build/lib.linux-aarch64-cpython-310/xformers/ops/_triton/k_scaled_index_add.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/_triton copying build/lib.linux-aarch64-cpython-310/xformers/ops/_triton/rmsnorm_kernels.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/_triton copying build/lib.linux-aarch64-cpython-310/xformers/ops/_triton/rope_padded_kernels.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/_triton copying build/lib.linux-aarch64-cpython-310/xformers/ops/_triton/sequence_parallel_fused_kernels.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/_triton copying build/lib.linux-aarch64-cpython-310/xformers/ops/_triton/tiled_matmul_kernels.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/_triton creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha copying build/lib.linux-aarch64-cpython-310/xformers/ops/fmha/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha copying build/lib.linux-aarch64-cpython-310/xformers/ops/fmha/attn_bias.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha copying build/lib.linux-aarch64-cpython-310/xformers/ops/fmha/common.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha copying build/lib.linux-aarch64-cpython-310/xformers/ops/fmha/cutlass.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha copying build/lib.linux-aarch64-cpython-310/xformers/ops/fmha/decoder.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha copying build/lib.linux-aarch64-cpython-310/xformers/ops/fmha/dispatch.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha copying build/lib.linux-aarch64-cpython-310/xformers/ops/fmha/flash.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha copying build/lib.linux-aarch64-cpython-310/xformers/ops/fmha/small_k.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha copying build/lib.linux-aarch64-cpython-310/xformers/ops/fmha/triton.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha copying build/lib.linux-aarch64-cpython-310/xformers/ops/fmha/triton_splitk.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/profiler copying build/lib.linux-aarch64-cpython-310/xformers/profiler/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/profiler copying build/lib.linux-aarch64-cpython-310/xformers/profiler/api.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/profiler copying build/lib.linux-aarch64-cpython-310/xformers/profiler/device_limits.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/profiler copying build/lib.linux-aarch64-cpython-310/xformers/profiler/profiler.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/profiler copying build/lib.linux-aarch64-cpython-310/xformers/profiler/profiler_dcgm.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/profiler copying build/lib.linux-aarch64-cpython-310/xformers/profiler/profiler_dcgm_impl.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/profiler copying build/lib.linux-aarch64-cpython-310/xformers/profiler/slow_ops_profiler.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/profiler creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/sparse copying build/lib.linux-aarch64-cpython-310/xformers/sparse/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/sparse copying build/lib.linux-aarch64-cpython-310/xformers/sparse/_csr_ops.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/sparse copying build/lib.linux-aarch64-cpython-310/xformers/sparse/blocksparse_tensor.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/sparse copying build/lib.linux-aarch64-cpython-310/xformers/sparse/csr_tensor.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/sparse copying build/lib.linux-aarch64-cpython-310/xformers/sparse/utils.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/sparse creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton copying build/lib.linux-aarch64-cpython-310/xformers/triton/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton copying build/lib.linux-aarch64-cpython-310/xformers/triton/dropout.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton copying build/lib.linux-aarch64-cpython-310/xformers/triton/fused_linear_layer.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton copying build/lib.linux-aarch64-cpython-310/xformers/triton/k_activations.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton copying build/lib.linux-aarch64-cpython-310/xformers/triton/k_dropout.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton copying build/lib.linux-aarch64-cpython-310/xformers/triton/k_fused_matmul_bw.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton copying build/lib.linux-aarch64-cpython-310/xformers/triton/k_fused_matmul_fw.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton copying build/lib.linux-aarch64-cpython-310/xformers/triton/k_layer_norm.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton copying build/lib.linux-aarch64-cpython-310/xformers/triton/k_softmax.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton copying build/lib.linux-aarch64-cpython-310/xformers/triton/layer_norm.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton copying build/lib.linux-aarch64-cpython-310/xformers/triton/softmax.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton copying build/lib.linux-aarch64-cpython-310/xformers/triton/vararg_kernel.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/bert_padding.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/flash_attn_interface.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/flash_attn_triton.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/flash_attn_triton_og.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/flash_blocksparse_attention.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/flash_blocksparse_attn_interface.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/fused_softmax.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/layers copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/layers/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/layers copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/layers/patch_embed.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/layers copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/layers/rotary.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/layers creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/losses copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/losses/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/losses copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/losses/cross_entropy.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/losses creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models/baichuan.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models/bert.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models/bigcode.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models/falcon.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models/gpt.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models/gpt_neox.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models/gptj.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models/llama.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models/opt.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models/vit.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/modules copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/modules/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/modules copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/modules/block.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/modules copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/modules/embedding.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/modules copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/modules/mha.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/modules copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/modules/mlp.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/modules creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/activations.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/fused_dense.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/layer_norm.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/rms_norm.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/triton copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/triton copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton/cross_entropy.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/triton copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton/k_activations.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/triton copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton/layernorm.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/triton copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton/linear.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/triton copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton/mlp.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/triton copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton/rotary.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/triton creating /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/utils copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/utils/__init__.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/utils copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/utils/benchmark.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/utils copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/utils/distributed.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/utils copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/utils/generation.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/utils copying build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/utils/pretrained.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/utils copying build/lib.linux-aarch64-cpython-310/xformers/_C_flashattention.so -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers copying build/lib.linux-aarch64-cpython-310/xformers/_C.so -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers copying build/lib.linux-aarch64-cpython-310/xformers/cpp_lib.json -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers copying build/lib.linux-aarch64-cpython-310/xformers/version.py -> /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_cpp_lib.py to _cpp_lib.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_deprecation_warning.py to _deprecation_warning.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/attn_bias_utils.py to attn_bias_utils.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/checkpoint.py to checkpoint.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/info.py to info.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/test.py to test.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/utils.py to utils.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_attn_decoding.py to benchmark_attn_decoding.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_blocksparse_transformers.py to benchmark_blocksparse_transformers.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_causal_blocksparse.py to benchmark_causal_blocksparse.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_core.py to benchmark_core.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_indexing.py to benchmark_indexing.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_mem_eff_attention.py to benchmark_mem_eff_attention.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_mem_eff_attn_decoder.py to benchmark_mem_eff_attn_decoder.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_mlp.py to benchmark_mlp.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_multi_head_dispatch.py to benchmark_multi_head_dispatch.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_nystrom_utils.py to benchmark_nystrom_utils.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_revnet.py to benchmark_revnet.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_sddmm.py to benchmark_sddmm.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_sequence_parallel_fused.py to benchmark_sequence_parallel_fused.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_sp24.py to benchmark_sp24.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_swiglu.py to benchmark_swiglu.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_tiled_matmul.py to benchmark_tiled_matmul.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_transformer.py to benchmark_transformer.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_triton_blocksparse.py to benchmark_triton_blocksparse.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_triton_dropout.py to benchmark_triton_dropout.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_triton_fused_linear.py to benchmark_triton_fused_linear.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_triton_layernorm.py to benchmark_triton_layernorm.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/benchmark_triton_softmax.py to benchmark_triton_softmax.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/utils.py to utils.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA/batch_fetch_results.py to batch_fetch_results.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA/batch_submit.py to batch_submit.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA/run_grid_search.py to run_grid_search.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA/run_tasks.py to run_tasks.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA/run_with_submitit.py to run_with_submitit.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA/code/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA/code/dataset.py to dataset.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/benchmarks/LRA/code/model_wrapper.py to model_wrapper.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/activations.py to activations.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/input_projection.py to input_projection.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/multi_head_dispatch.py to multi_head_dispatch.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/patch_embedding.py to patch_embedding.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/residual.py to residual.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/reversible.py to reversible.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/simplicial_embedding.py to simplicial_embedding.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/_sputnik_sparse.py to _sputnik_sparse.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/attention_mask.py to attention_mask.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/attention_patterns.py to attention_patterns.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/base.py to base.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/blocksparse.py to blocksparse.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/compositional.py to compositional.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/core.py to core.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/favor.py to favor.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/fourier_mix.py to fourier_mix.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/global_tokens.py to global_tokens.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/lambda_layer.py to lambda_layer.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/linformer.py to linformer.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/local.py to local.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/nystrom.py to nystrom.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/ortho.py to ortho.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/pooling.py to pooling.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/random.py to random.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/scaled_dot_product.py to scaled_dot_product.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/sparsity_config.py to sparsity_config.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/utils.py to utils.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/visual.py to visual.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/feature_maps/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/feature_maps/base.py to base.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/attention/feature_maps/softmax.py to softmax.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/feedforward/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/feedforward/base.py to base.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/feedforward/conv_mlp.py to conv_mlp.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/feedforward/fused_mlp.py to fused_mlp.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/feedforward/mixture_of_experts.py to mixture_of_experts.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/feedforward/mlp.py to mlp.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/positional_embedding/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/positional_embedding/base.py to base.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/positional_embedding/param.py to param.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/positional_embedding/rotary.py to rotary.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/positional_embedding/sine.py to sine.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/components/positional_embedding/vocab.py to vocab.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/factory/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/factory/block_configs.py to block_configs.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/factory/block_factory.py to block_factory.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/factory/hydra_helper.py to hydra_helper.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/factory/model_factory.py to model_factory.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/factory/weight_init.py to weight_init.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/helpers/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/helpers/hierarchical_configs.py to hierarchical_configs.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/helpers/test_utils.py to test_utils.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/helpers/timm_sparse_attention.py to timm_sparse_attention.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/common.py to common.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/differentiable_collectives.py to differentiable_collectives.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/indexing.py to indexing.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/modpar_layers.py to modpar_layers.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/rmsnorm.py to rmsnorm.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/rope_padded.py to rope_padded.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/seqpar.py to seqpar.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/sequence_parallel_fused_ops.py to sequence_parallel_fused_ops.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/sp24.py to sp24.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/swiglu_op.py to swiglu_op.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/tiled_matmul.py to tiled_matmul.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/unbind.py to unbind.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/_triton/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/_triton/k_index_select_cat.py to k_index_select_cat.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/_triton/k_scaled_index_add.py to k_scaled_index_add.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/_triton/rmsnorm_kernels.py to rmsnorm_kernels.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/_triton/rope_padded_kernels.py to rope_padded_kernels.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/_triton/sequence_parallel_fused_kernels.py to sequence_parallel_fused_kernels.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/_triton/tiled_matmul_kernels.py to tiled_matmul_kernels.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha/attn_bias.py to attn_bias.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha/common.py to common.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha/cutlass.py to cutlass.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha/decoder.py to decoder.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha/dispatch.py to dispatch.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha/flash.py to flash.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha/small_k.py to small_k.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha/triton.py to triton.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/ops/fmha/triton_splitk.py to triton_splitk.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/profiler/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/profiler/api.py to api.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/profiler/device_limits.py to device_limits.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/profiler/profiler.py to profiler.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/profiler/profiler_dcgm.py to profiler_dcgm.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/profiler/profiler_dcgm_impl.py to profiler_dcgm_impl.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/profiler/slow_ops_profiler.py to slow_ops_profiler.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/sparse/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/sparse/_csr_ops.py to _csr_ops.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/sparse/blocksparse_tensor.py to blocksparse_tensor.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/sparse/csr_tensor.py to csr_tensor.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/sparse/utils.py to utils.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton/dropout.py to dropout.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton/fused_linear_layer.py to fused_linear_layer.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton/k_activations.py to k_activations.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton/k_dropout.py to k_dropout.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton/k_fused_matmul_bw.py to k_fused_matmul_bw.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton/k_fused_matmul_fw.py to k_fused_matmul_fw.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton/k_layer_norm.py to k_layer_norm.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton/k_softmax.py to k_softmax.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton/layer_norm.py to layer_norm.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton/softmax.py to softmax.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/triton/vararg_kernel.py to vararg_kernel.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/bert_padding.py to bert_padding.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/flash_attn_interface.py to flash_attn_interface.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/flash_attn_triton.py to flash_attn_triton.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/flash_attn_triton_og.py to flash_attn_triton_og.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/flash_blocksparse_attention.py to flash_blocksparse_attention.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/flash_blocksparse_attn_interface.py to flash_blocksparse_attn_interface.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/fused_softmax.py to fused_softmax.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/layers/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/layers/patch_embed.py to patch_embed.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/layers/rotary.py to rotary.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/losses/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/losses/cross_entropy.py to cross_entropy.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models/baichuan.py to baichuan.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models/bert.py to bert.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models/bigcode.py to bigcode.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models/falcon.py to falcon.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models/gpt.py to gpt.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models/gpt_neox.py to gpt_neox.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models/gptj.py to gptj.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models/llama.py to llama.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models/opt.py to opt.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/models/vit.py to vit.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/modules/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/modules/block.py to block.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/modules/embedding.py to embedding.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/modules/mha.py to mha.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/modules/mlp.py to mlp.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/activations.py to activations.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/fused_dense.py to fused_dense.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/layer_norm.py to layer_norm.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/rms_norm.py to rms_norm.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/triton/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/triton/cross_entropy.py to cross_entropy.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/triton/k_activations.py to k_activations.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/triton/layernorm.py to layernorm.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/triton/linear.py to linear.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/triton/mlp.py to mlp.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/ops/triton/rotary.py to rotary.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/utils/__init__.py to __init__.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/utils/benchmark.py to benchmark.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/utils/distributed.py to distributed.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/utils/generation.py to generation.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_flash_attn/utils/pretrained.py to pretrained.cpython-310.pyc byte-compiling /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/version.py to version.cpython-310.pyc writing byte-compilation script '/tmp/tmpe9lr7k7k.py' /usr/bin/python3 /tmp/tmpe9lr7k7k.py removing /tmp/tmpe9lr7k7k.py running install_egg_info running egg_info creating xformers.egg-info writing xformers.egg-info/PKG-INFO writing dependency_links to xformers.egg-info/dependency_links.txt writing requirements to xformers.egg-info/requires.txt writing top-level names to xformers.egg-info/top_level.txt writing manifest file 'xformers.egg-info/SOURCES.txt' reading manifest file 'xformers.egg-info/SOURCES.txt' reading manifest template 'MANIFEST.in' no previously-included directories found matching 'third_party/flash-attention/csrc/cutlass/docs/' no previously-included directories found matching 'third_party/flash-attention/csrc/cutlass/test/' no previously-included directories found matching 'third_party/flash-attention/csrc/cutlass/tools/' no previously-included directories found matching 'third_party/flash-attention/csrc/cutlass/media/' no previously-included directories found matching 'third_party/flash-attention/csrc/cutlass/python/' adding license file 'LICENSE' writing manifest file 'xformers.egg-info/SOURCES.txt' Copying xformers.egg-info to /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers-0.0.24-py3.10.egg-info running install_scripts + rm -rfv /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/bin/__pycache__ + /usr/bin/find-debuginfo -j80 --strict-build-id -m -i --build-id-seed 0.0.24-1.an23 --unique-debug-suffix -0.0.24-1.an23.aarch64 --unique-debug-src-base python-xformers-0.0.24-1.an23.aarch64 --run-dwz --dwz-low-mem-die-limit 10000000 --dwz-max-die-limit 50000000 -S debugsourcefiles.list /builddir/build/BUILD/xformers-0.0.24 extracting debug info from /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_C_flashattention.so extracting debug info from /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10/site-packages/xformers/_C.so original debug info size: 40320kB, size after compression: 30928kB /usr/bin/sepdebugcrcfix: Updated 2 CRC32s, 0 CRC32s did match. 424 blocks + /usr/lib/rpm/check-buildroot + /usr/lib/rpm/anolis/brp-ldconfig + COMPRESS='zstd -f --rm -19 -T0' + COMPRESS_EXT=.zst + /usr/lib/rpm/brp-compress + /usr/lib/rpm/anolis/brp-strip-lto /usr/bin/strip + /usr/lib/rpm/brp-strip-static-archive /usr/bin/strip + /usr/lib/rpm/check-rpaths + /usr/lib/rpm/brp-remove-la-files + /usr/lib/rpm/anolis/clean_perl + /usr/lib/rpm/anolis/check_elf_files + /usr/lib/rpm/anolis/brp-mangle-shebangs + /usr/lib/rpm/anolis/remove-info-dir + /usr/lib/rpm/anolis/check-desktop-files + /usr/lib/rpm/anolis/brp-python-bytecompile '' 1 0 Bytecompiling .py files below /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib64/python3.10 using python3.10 Bytecompiling .py files below /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/lib/debug/usr/lib64/python3.10 using python3.10 + /usr/lib/rpm/anolis/brp-python-hardlink Processing files: python3-xformers-0.0.24-1.an23.aarch64 Executing(%doc): /bin/sh -e /var/tmp/rpm-tmp.8Z89cq + umask 022 + cd /builddir/build/BUILD + cd xformers-0.0.24 + DOCDIR=/builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/share/doc/python3-xformers + export LC_ALL=C + LC_ALL=C + export DOCDIR + /usr/bin/mkdir -p /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/share/doc/python3-xformers + cp -pr README.md /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/share/doc/python3-xformers + RPM_EC=0 ++ jobs -p + exit 0 Executing(%license): /bin/sh -e /var/tmp/rpm-tmp.O6SEZZ + umask 022 + cd /builddir/build/BUILD + cd xformers-0.0.24 + LICENSEDIR=/builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/share/licenses/python3-xformers + export LC_ALL=C + LC_ALL=C + export LICENSEDIR + /usr/bin/mkdir -p /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/share/licenses/python3-xformers + cp -pr LICENSE /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64/usr/share/licenses/python3-xformers + RPM_EC=0 ++ jobs -p + exit 0 Provides: python-xformers = 0.0.24-1.an23 python3-xformers = 0.0.24-1.an23 python3-xformers(aarch-64) = 0.0.24-1.an23 python3.10-xformers = 0.0.24-1.an23 python3.10dist(xformers) = 0.0.24 python3dist(xformers) = 0.0.24 Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PartialHardlinkSets) <= 4.0.4-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Requires: ld-linux-aarch64.so.1()(64bit) ld-linux-aarch64.so.1(GLIBC_2.17)(64bit) libc.so.6()(64bit) libc.so.6(GLIBC_2.17)(64bit) libc.so.6(GLIBC_2.32)(64bit) libc10.so()(64bit) libc10_cuda.so()(64bit) libcudart.so.12()(64bit) libcudart.so.12(libcudart.so.12)(64bit) libgcc_s.so.1()(64bit) libgcc_s.so.1(GCC_3.0)(64bit) libgcc_s.so.1(GCC_3.3.1)(64bit) libm.so.6()(64bit) libm.so.6(GLIBC_2.17)(64bit) libm.so.6(GLIBC_2.27)(64bit) libm.so.6(GLIBC_2.29)(64bit) libpython3.10.so.1.0()(64bit) libstdc++.so.6()(64bit) libstdc++.so.6(CXXABI_1.3)(64bit) libstdc++.so.6(CXXABI_1.3.11)(64bit) libstdc++.so.6(CXXABI_1.3.13)(64bit) libstdc++.so.6(CXXABI_1.3.2)(64bit) libstdc++.so.6(CXXABI_1.3.3)(64bit) libstdc++.so.6(CXXABI_1.3.5)(64bit) libstdc++.so.6(CXXABI_1.3.9)(64bit) libstdc++.so.6(GLIBCXX_3.4)(64bit) libstdc++.so.6(GLIBCXX_3.4.11)(64bit) libstdc++.so.6(GLIBCXX_3.4.14)(64bit) libstdc++.so.6(GLIBCXX_3.4.15)(64bit) libstdc++.so.6(GLIBCXX_3.4.18)(64bit) libstdc++.so.6(GLIBCXX_3.4.20)(64bit) libstdc++.so.6(GLIBCXX_3.4.21)(64bit) libstdc++.so.6(GLIBCXX_3.4.26)(64bit) libstdc++.so.6(GLIBCXX_3.4.29)(64bit) libstdc++.so.6(GLIBCXX_3.4.30)(64bit) libstdc++.so.6(GLIBCXX_3.4.9)(64bit) libtorch_cpu.so()(64bit) libtorch_cuda.so()(64bit) libtorch_python.so()(64bit) python(abi) = 3.10 python3.10dist(numpy) python3.10dist(torch) >= 2.1 rtld(GNU_HASH) Obsoletes: python-xformers < 0.0.24-1.an23 Processing files: python-xformers-debugsource-0.0.24-1.an23.aarch64 Provides: python-xformers-debugsource = 0.0.24-1.an23 python-xformers-debugsource(aarch-64) = 0.0.24-1.an23 Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Processing files: python3-xformers-debuginfo-0.0.24-1.an23.aarch64 Provides: debuginfo(build-id) = 7a84a0f2785ed7ef3405aeb70f5169bb5acf4dcd debuginfo(build-id) = fd839caf41f885b9f524879225842ba930053aa4 python-xformers-debuginfo = 0.0.24-1.an23 python3-xformers-debuginfo = 0.0.24-1.an23 python3-xformers-debuginfo(aarch-64) = 0.0.24-1.an23 python3.10-xformers-debuginfo = 0.0.24-1.an23 Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Recommends: python-xformers-debugsource(aarch-64) = 0.0.24-1.an23 Checking for unpackaged file(s): /usr/lib/rpm/check-files /builddir/build/BUILDROOT/python-xformers-0.0.24-1.an23.aarch64 Wrote: /builddir/build/RPMS/python-xformers-debugsource-0.0.24-1.an23.aarch64.rpm Wrote: /builddir/build/RPMS/python3-xformers-debuginfo-0.0.24-1.an23.aarch64.rpm Wrote: /builddir/build/RPMS/python3-xformers-0.0.24-1.an23.aarch64.rpm Child return code was: 0